Fix some things

2026-06-08 11:16:28 +08:00
parent 9fea9c6a53
commit e7963b267e
34 changed files with 5195 additions and 246 deletions
--- a/.env
+++ b/.env
@@ -54,6 +54,11 @@ DOCUMENT_REPOSITORY_BACKEND=json
 # Default false: processing runs in FastAPI's threadpool — no external worker needed.
 USE_CELERY_WORKER=false
 # ===== 法规感知爬取配置 =====
 PERCEPTION_CRAWL_TIMEOUT_SECONDS=120
 PERCEPTION_MAX_EVENTS_PER_SOURCE=100
 PERCEPTION_DIFF_SIMILARITY_THRESHOLD=0.85
 # ===== API配置 =====
 API_HOST=0.0.0.0
 API_PORT=8000
--- a/.env.example
+++ b/.env.example
@@ -55,6 +55,11 @@ DOCUMENT_REPOSITORY_BACKEND=json
 # Default false: document processing runs in FastAPI's threadpool (no external worker needed).
 USE_CELERY_WORKER=false
 # ===== 法规感知爬取配置 =====
 PERCEPTION_CRAWL_TIMEOUT_SECONDS=120
 PERCEPTION_MAX_EVENTS_PER_SOURCE=100
 PERCEPTION_DIFF_SIMILARITY_THRESHOLD=0.85
 # ===== 阿里云文档解析 =====
 ALIBABA_ACCESS_KEY_ID=your_aliyun_access_key_id
 ALIBABA_ACCESS_KEY_SECRET=your_aliyun_access_key_secret
--- a/backend/app/api/routes/perception.py
+++ b/backend/app/api/routes/perception.py
@@ -4,10 +4,12 @@ from __future__ import annotations
 import json
-from fastapi import APIRouter, Query
+from fastapi import APIRouter, Depends, Query
 from fastapi.responses import StreamingResponse
-from app.shared.bootstrap import get_perception_service
+from app.shared.bootstrap import get_crawl_service, get_event_store, get_perception_service
 from app.api.dependencies.auth import get_current_user
 from app.domain.auth.models import UserClaims
 from app.shared.async_utils import iter_in_thread
 router = APIRouter(prefix="/perception", tags=["智能感知"])
@@ -65,3 +67,77 @@ async def analyze_event(event_id: str):
            "X-Accel-Buffering": "no",
        },
    )
@router.post("/crawl")
 async def run_crawl(
    body: dict = None,
    current_user: UserClaims = Depends(get_current_user),
 ):
    """Trigger manual crawl of regulatory sources. Streams SSE progress.
    Body (optional): {"sources": ["CATARC", "国标委·强制性", "EUR-Lex"]}
    Omit sources to crawl all registered sources.
    """
    sources: list[str] | None = (body or {}).get("sources")
    crawl_svc = get_crawl_service()
    async def crawl_stream():
        async for item in iter_in_thread(crawl_svc.run_crawl(sources=sources)):
            event_name = item.get("event", "message")
            data = item.get("data", "")
            if isinstance(data, (dict, list)):
                data = json.dumps(data, ensure_ascii=False)
            yield f"event: {event_name}\ndata: {data}\n\n"
    return StreamingResponse(
        crawl_stream(),
        media_type="text/event-stream",
        headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
    )
@router.post("/events/{event_id}/process")
 async def process_event(
    event_id: str,
    current_user: UserClaims = Depends(get_current_user),
 ):
    """Trigger LLM pipeline (extract + assess + diff) for a single event."""
    from datetime import UTC, datetime
    from app.infrastructure.perception.llm_pipeline import LlmPipeline
    from app.shared.bootstrap import get_retrieval_service
    event = get_perception_service().get_event(event_id)
    if not event:
        from fastapi import HTTPException
        raise HTTPException(status_code=404, detail=f"Event {event_id} not found")
    store = get_event_store()
    pipeline = LlmPipeline()
    structure = pipeline.extract_structure(event)
    event.update(structure)
    event["affected_docs"] = pipeline.assess_impact(event, get_retrieval_service())
    event["processed_at"] = datetime.now(UTC).isoformat()
    store.upsert(event)
    return {"status": "ok", "event_id": event_id, "processed_at": event["processed_at"]}
@router.get("/events/{event_id}/diff")
 async def get_event_diff(event_id: str):
    """Return semantic diff detail for an event (only available if previously crawled twice)."""
    event = get_perception_service().get_event(event_id)
    if not event:
        from fastapi import HTTPException
        raise HTTPException(status_code=404, detail=f"Event {event_id} not found")
    if not event.get("change_summary"):
        from fastapi import HTTPException
        raise HTTPException(status_code=404, detail="No diff available for this event")
    return {
        "event_id": event_id,
        "change_summary": event.get("change_summary"),
        "changed_sections": event.get("changed_sections") or [],
        "previous_hash": event.get("previous_hash"),
        "content_hash": event.get("content_hash"),
    }
--- a/backend/app/application/perception/crawl_service.py
+++ b/backend/app/application/perception/crawl_service.py
@@ -0,0 +1,147 @@
 """Orchestrates regulatory source crawlers and LLM enrichment pipeline."""
 from __future__ import annotations
 import hashlib
 from typing import Any, Generator
 from loguru import logger
 from app.infrastructure.perception.base_event_store import BaseEventStore
 from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
 from app.infrastructure.perception.llm_pipeline import LlmPipeline
 def _event_id(source: str, standard_code: str) -> str:
    """Deterministic 12-char ID from source + standard_code."""
    return hashlib.sha256(f"{source}-{standard_code}".encode()).hexdigest()[:12]
 def _content_hash(raw_text: str) -> str:
    return hashlib.sha256(raw_text.encode()).hexdigest()
 def _raw_to_dict(raw: RawEvent, event_id: str, content_hash: str) -> dict:
    return {
        "id": event_id,
        "source": raw.source,
        "source_label": raw.source_label,
        "standard_code": raw.standard_code,
        "title": raw.title,
        "summary": raw.summary,
        "full_text_url": raw.full_text_url,
        "status": raw.status,
        "impact_level": "medium",
        "published_at": raw.published_at,
        "effective_at": raw.effective_at,
        "category": raw.category,
        "tags": raw.tags,
        "content_hash": content_hash,
        "previous_hash": None,
    }
 class CrawlService:
    """Orchestrate crawlers, hash-based change detection, and LLM enrichment.

    Args:
        crawlers: Registered crawlers keyed by source name.
        event_store: Store used to look up and upsert event records.
        llm_pipeline: Pipeline providing ``extract_structure``,
            ``assess_impact`` and ``compute_diff``.
        retrieval_service: Passed through to ``assess_impact``.
        max_events_per_source: ``limit`` handed to each crawler's ``fetch``.
    """

    def __init__(
        self,
        crawlers: dict[str, BaseCrawler],
        event_store: BaseEventStore,
        llm_pipeline: LlmPipeline,
        retrieval_service: Any,
        max_events_per_source: int = 100,
    ) -> None:
        self._crawlers = crawlers
        self._store = event_store
        self._pipeline = llm_pipeline
        self._retrieval = retrieval_service
        self._max_events = max_events_per_source

    def run_crawl(
        self, sources: list[str] | None = None
    ) -> Generator[dict, None, None]:
        """Run crawl for selected sources. Yields SSE-ready progress dicts.

        Every yielded dict has an ``event`` name ("progress", "error" or
        "done") and a ``data`` payload. A falsy ``sources`` selects all
        registered crawlers. A failing crawler is reported and skipped; the
        remaining sources are still processed.
        """
        targets = sources or list(self._crawlers.keys())
        total_new = 0
        total_updated = 0
        for source_key in targets:
            crawler = self._crawlers.get(source_key)
            if not crawler:
                yield {"event": "error", "data": f"Unknown source: {source_key}"}
                continue
            yield {"event": "progress", "data": {"source": source_key, "stage": "fetching"}}
            try:
                raw_events = crawler.fetch(limit=self._max_events)
            except Exception as exc:
                logger.exception("Crawler failed source={}", source_key)
                yield {"event": "error", "data": {"source": source_key, "message": str(exc)}}
                continue
            yield {
                "event": "progress",
                "data": {"source": source_key, "stage": "processing", "fetched": len(raw_events)},
            }
            new_count = 0
            updated_count = 0
            for raw in raw_events:
                outcome = self._ingest(raw)
                if outcome == "new":
                    new_count += 1
                elif outcome == "updated":
                    updated_count += 1
            total_new += new_count
            total_updated += updated_count
            yield {
                "event": "progress",
                "data": {
                    "source": source_key,
                    "stage": "done",
                    "new": new_count,
                    "updated": updated_count,
                },
            }
        yield {
            "event": "done",
            "data": {"total_new": total_new, "total_updated": total_updated},
        }

    def _ingest(self, raw: RawEvent) -> str | None:
        """Enrich and store one raw event.

        Returns "new" or "updated", or None when the stored content hash
        already matches and nothing was written.
        """
        eid = _event_id(raw.source, raw.standard_code)
        new_hash = _content_hash(raw.raw_text or raw.title)
        existing = self._store.get(eid)
        if existing and existing.get("content_hash") == new_hash:
            return None
        is_update = existing is not None
        event_dict = _raw_to_dict(raw, eid, new_hash)
        event_dict["previous_hash"] = existing.get("content_hash") if is_update else None
        self._enrich(event_dict, eid)
        if is_update:
            self._attach_diff(event_dict, existing.get("summary", ""), raw.summary, eid)
        self._store.upsert(event_dict)
        return "updated" if is_update else "new"

    def _enrich(self, event_dict: dict, eid: str) -> None:
        """Best-effort structure extraction and impact assessment (failures are logged)."""
        try:
            structure = self._pipeline.extract_structure(event_dict)
            event_dict.update(structure)
        except Exception as exc:
            logger.warning("Structure extraction failed id={} err={}", eid, exc)
        try:
            affected = self._pipeline.assess_impact(event_dict, self._retrieval)
            event_dict["affected_docs"] = affected
        except Exception as exc:
            logger.warning("Impact assessment failed id={} err={}", eid, exc)

    def _attach_diff(self, event_dict: dict, old_text: str, new_text: str, eid: str) -> None:
        """Best-effort semantic diff between the stored and the new summary.

        Only the summary is persisted for earlier versions, so the diff
        compares summary with summary; diffing the old summary against the new
        raw text would flag differences that are not real changes.
        """
        if not old_text or not new_text or old_text == new_text:
            return
        try:
            diff = self._pipeline.compute_diff(old_text, new_text)
            event_dict["change_summary"] = diff.get("change_summary")
            event_dict["changed_sections"] = diff.get("changed_sections")
        except Exception as exc:
            logger.warning("Diff failed id={} err={}", eid, exc)
--- a/backend/app/application/perception/services.py
+++ b/backend/app/application/perception/services.py
@@ -6,7 +6,7 @@ import json
 from typing import Generator
 from app.application.knowledge.services import KnowledgeRetrievalService
-from app.infrastructure.perception.mock_event_store import MockEventStore
+from app.infrastructure.perception.base_event_store import BaseEventStore
 from app.services.llm.llm_factory import get_llm_client
 from app.config.settings import settings
@@ -22,7 +22,7 @@ class PerceptionService:
    def __init__(
        self,
-        event_store: MockEventStore,
+        event_store: BaseEventStore,
        retrieval_service: KnowledgeRetrievalService,
    ) -> None:
        self._store = event_store
--- a/backend/app/config/settings.py
+++ b/backend/app/config/settings.py
@@ -87,6 +87,18 @@ class Settings(BaseSettings):
    # no external worker needed. Switch to True only when a Celery worker is running.
    use_celery_worker: bool = Field(default=False, description="使用 Celery Worker 异步处理文档 (需要 Worker 运行中)")
    # ── Perception crawl ──────────────────────────────────────────────────────
    perception_crawl_timeout_seconds: int = Field(
        default=120, description="HTTP timeout for regulatory source crawlers."
    )
    perception_max_events_per_source: int = Field(
        default=100, description="Maximum events fetched per source per crawl run."
    )
    perception_diff_similarity_threshold: float = Field(
        default=0.85,
        description="Cosine similarity below which a paragraph is flagged as changed.",
    )
    # Keep configuration setup explicit so runtime behavior is easy to reason about.
    api_host: str = Field(default="0.0.0.0", description="API服务地址")
    api_port: int = Field(default=8000, description="API服务端口")
--- a/backend/app/infrastructure/perception/base_event_store.py
+++ b/backend/app/infrastructure/perception/base_event_store.py
@@ -0,0 +1,39 @@
 """Abstract base class for regulatory event stores."""
 from __future__ import annotations
 from abc import ABC, abstractmethod
 class BaseEventStore(ABC):
    """Persistence port that every regulatory event store implements."""

    @abstractmethod
    def all(self) -> list[dict]:
        """List every stored event, newest first."""
        ...

    @abstractmethod
    def get(self, event_id: str) -> dict | None:
        """Look up one event by its ID; None when it does not exist."""
        ...

    @abstractmethod
    def filter(
        self,
        *,
        source: str | None = None,
        impact_level: str | None = None,
        limit: int = 50,
    ) -> list[dict]:
        """List events matching the given filters, ordered by published_at descending."""
        ...

    @abstractmethod
    def stats(self) -> dict:
        """Summarise the store as {total, high_impact, medium_impact, low_impact, recent_90d}."""
        ...

    @abstractmethod
    def upsert(self, event: dict) -> None:
        """Create the event record, or replace it when it already exists."""
        ...

    @abstractmethod
    def get_by_standard_code(self, standard_code: str) -> dict | None:
        """Find the most recent event for ``standard_code``; None when there is none."""
        ...
--- a/backend/app/infrastructure/perception/crawlers/__init__.py
+++ b/backend/app/infrastructure/perception/crawlers/__init__.py
--- a/backend/app/infrastructure/perception/crawlers/_utils.py
+++ b/backend/app/infrastructure/perception/crawlers/_utils.py
@@ -0,0 +1,43 @@
 """Shared utility functions for crawlers."""
 from __future__ import annotations
 import re
 from datetime import date
 def parse_date(text: str | None) -> str:
    """Return YYYY-MM-DD from common Chinese date formats, or today's date.

    Recognised forms (anywhere inside ``text``): ``YYYY-MM-DD``,
    ``YYYY/MM/DD``, ``YYYY.MM.DD`` and ``YYYY年MM月DD日`` (trailing 日
    optional). Month and day may be one or two digits.

    Args:
        text: Free-form text that may contain a date. ``None`` is treated
            like an empty string.

    Returns:
        The first valid date found, ISO-formatted; today's date when the text
        is empty or contains no valid calendar date.
    """
    cleaned = (text or "").strip()
    if cleaned:
        patterns = (
            r"(\d{4})[/.-](\d{1,2})[/.-](\d{1,2})",
            r"(\d{4})年(\d{1,2})月(\d{1,2})日?",
        )
        for pattern in patterns:
            found = re.search(pattern, cleaned)
            if found is None:
                continue
            year, month, day = (int(part) for part in found.groups())
            try:
                return date(year, month, day).isoformat()
            except ValueError:
                # Looks like a date but is not a real calendar day
                # (e.g. month 13); try the next pattern.
                continue
    return date.today().isoformat()
 def extract_tags(standard_code: str, title: str) -> list[str]:
    """Build up to five keyword tags from a standard code and its title.

    A code containing "GB" is tagged 国家标准; a code containing "/T" is
    tagged 推荐性 and any other code 强制性. Title keywords follow in a
    fixed order.
    """
    normalized_code = standard_code.upper()
    labels: list[str] = ["国家标准"] if "GB" in normalized_code else []
    labels.append("推荐性" if "/T" in normalized_code else "强制性")
    topic_words = ("电动", "安全", "自动驾驶", "充电", "智能网联", "碰撞", "排放", "网络安全")
    labels.extend(word for word in topic_words if word in title)
    return labels[:5]
--- a/backend/app/infrastructure/perception/crawlers/base.py
+++ b/backend/app/infrastructure/perception/crawlers/base.py
@@ -0,0 +1,32 @@
 """Shared contracts for regulatory source crawlers."""
 from __future__ import annotations
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
@dataclass
 class RawEvent:
    """Raw regulatory event returned by a crawler before enrichment.

    Field order is part of the positional constructor signature generated by
    ``@dataclass``; only ``tags`` and ``raw_text`` have defaults.
    """
    # Short source key set by the crawler, e.g. "CATARC" or "EUR-Lex".
    source: str
    # Human-readable name of the publishing body.
    source_label: str
    # Identifier of the standard/regulation as published by the source.
    standard_code: str
    title: str
    summary: str
    # Link to the detail page or full text.
    full_text_url: str
    status: str           # 'enacted' | 'draft' | 'consultation'
    published_at: str     # YYYY-MM-DD string
    # Effective date, or None when the source does not provide one.
    effective_at: str | None
    category: str
    # Each instance gets its own list (default_factory), never a shared one.
    tags: list[str] = field(default_factory=list)
    raw_text: str = ""    # full crawled text for hashing + LLM
 class BaseCrawler(ABC):
    """Interface implemented by every regulatory source crawler."""

    @abstractmethod
    def fetch(self, limit: int = 50) -> list[RawEvent]:
        """Return at most `limit` recent events from the underlying data source."""
        ...
--- a/backend/app/infrastructure/perception/crawlers/catarc_crawler.py
+++ b/backend/app/infrastructure/perception/crawlers/catarc_crawler.py
@@ -0,0 +1,83 @@
 """Crawler for CATARC automotive standard catalogue."""
 from __future__ import annotations
 from urllib.parse import urljoin
 import httpx
 from bs4 import BeautifulSoup
 from loguru import logger
 from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
 from ._utils import extract_tags, parse_date
 # Listing page scraped by CatarcCrawler; pages are requested as "?page=N".
 _BASE_URL = "https://www.catarc.org.cn/bzzxd/qcbz/index.html"
 # Origin used to resolve relative detail links found in the listing.
 _HOST = "https://www.catarc.org.cn"
 # Maps the status text shown in the listing to RawEvent.status values.
 # NOTE(review): "废止" is mapped to "enacted" — RawEvent.status only lists
 # enacted/draft/consultation; confirm this is the intended value for it.
 _STATUS_MAP = {
    "现行": "enacted",
    "即将实施": "enacted",
    "废止": "enacted",
    "征求意见": "consultation",
    "报批": "draft",
 }
 class CatarcCrawler(BaseCrawler):
    """Scrape the CATARC automotive standard list page.

    Args:
        timeout: HTTP timeout in seconds for each page request.
    """

    def __init__(self, timeout: float = 30) -> None:
        self._timeout = timeout

    def fetch(self, limit: int = 50) -> list[RawEvent]:
        """Fetch up to `limit` events, one per distinct standard code.

        Paging stops at the first failed request, at a page that contributes
        no new standard code, or after ``max(10, limit)`` pages.
        """
        events: list[RawEvent] = []
        seen_codes: set[str] = set()
        page = 1
        max_pages = max(10, limit)
        while len(events) < limit and page <= max_pages:
            url = f"{_BASE_URL}?page={page}"
            try:
                resp = httpx.get(url, timeout=self._timeout, follow_redirects=True)
                resp.raise_for_status()
            except Exception as exc:
                logger.warning("CATARC fetch failed page={} err={}", page, exc)
                break
            rows = BeautifulSoup(resp.text, "lxml").select("table tr")
            added = 0
            for row in rows:
                event = self._parse_row(row, url)
                if event is None or event.standard_code in seen_codes:
                    continue
                seen_codes.add(event.standard_code)
                events.append(event)
                added += 1
            # If the server ignores the page parameter every request returns
            # the same rows; stopping here avoids collecting them repeatedly.
            if not added:
                break
            page += 1
        return events[:limit]

    @staticmethod
    def _parse_row(row: Any, page_url: str) -> RawEvent | None:
        """Convert one table row into a RawEvent; None for rows with fewer than 3 cells."""
        cells = row.find_all("td")
        if len(cells) < 3:
            return None
        link = cells[0].find("a")
        standard_code = link.get_text(strip=True) if link else cells[0].get_text(strip=True)
        title = cells[1].get_text(strip=True)
        published_at = parse_date(cells[2].get_text(strip=True))
        # The status column is optional; unknown or missing text counts as enacted.
        status_text = cells[3].get_text(strip=True) if len(cells) > 3 else ""
        status = _STATUS_MAP.get(status_text, "enacted")
        # Fall back to the listing page when the row has no detail link.
        detail_url = urljoin(_HOST, link["href"]) if link and link.get("href") else page_url
        return RawEvent(
            source="CATARC",
            source_label="全国汽车标准化技术委员会",
            standard_code=standard_code,
            title=title,
            summary=title,
            full_text_url=detail_url,
            status=status,
            published_at=published_at,
            effective_at=None,
            category="汽车标准",
            tags=extract_tags(standard_code, title),
            raw_text=f"{standard_code} {title}",
        )
--- a/backend/app/infrastructure/perception/crawlers/eurlex_crawler.py
+++ b/backend/app/infrastructure/perception/crawlers/eurlex_crawler.py
@@ -0,0 +1,117 @@
 """Crawler for EUR-Lex RSS feeds covering EU AI Act and automotive regulations."""
 from __future__ import annotations
 import re
 from email.utils import parsedate_to_datetime
 import httpx
 from bs4 import BeautifulSoup
 from loguru import logger
 from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
 from ._utils import parse_date
 # RSS feeds polled by EurlexCrawler, in order.
 _EURLEX_RSS_URLS = [
    "https://eur-lex.europa.eu/rss-feed/OJ-L.rss",
 ]
 # An RSS item is kept only if its title or description contains one of these
 # keywords (compared case-insensitively, as plain substrings).
 _AUTOMOTIVE_KEYWORDS = [
    "vehicle", "automotive", "motor", "tyre", "emission", "ADAS", "autonomous",
    "AI Act", "artificial intelligence", "cybersecurity", "software update",
    "R155", "R156", "汽车", "车辆",
 ]
 # Lower-cased once at import time so the per-item check does not redo it.
 _AUTOMOTIVE_KEYWORDS_LOWER = [kw.lower() for kw in _AUTOMOTIVE_KEYWORDS]
 def _is_automotive_relevant(title: str, description: str) -> bool:
    """Tell whether title or description contains any automotive keyword (case-insensitive)."""
    haystack = " ".join((title, description)).lower()
    for keyword in _AUTOMOTIVE_KEYWORDS_LOWER:
        if keyword in haystack:
            return True
    return False
 def _extract_celex(url: str) -> str:
    m = re.search(r"CELEX[:/]([0-9A-Z]+)", url)
    return m.group(1) if m else ""
 def _parse_rss_date(rfc2822: str) -> str:
    try:
        dt = parsedate_to_datetime(rfc2822)
        return dt.date().isoformat()
    except Exception:
        return parse_date(rfc2822)
 class EurlexCrawler(BaseCrawler):
    """Fetch automotive-relevant EU regulations from EUR-Lex RSS feeds.

    Args:
        timeout: HTTP timeout in seconds for each feed request.
    """

    def __init__(self, timeout: float = 30) -> None:
        self._timeout = timeout

    def fetch(self, limit: int = 50) -> list[RawEvent]:
        """Fetch up to `limit` relevant items, one per distinct standard code.

        A feed that cannot be downloaded is logged and skipped. Items are
        identified by their CELEX number, or by the first 60 characters of the
        title when the link carries none.
        """
        events: list[RawEvent] = []
        seen_codes: set[str] = set()
        for rss_url in _EURLEX_RSS_URLS:
            if len(events) >= limit:
                break
            try:
                resp = httpx.get(rss_url, timeout=self._timeout, follow_redirects=True)
                resp.raise_for_status()
            except Exception as exc:
                logger.warning("EUR-Lex RSS fetch failed url={} err={}", rss_url, exc)
                continue
            soup = BeautifulSoup(resp.content, "lxml-xml")
            for item in soup.find_all("item"):
                if len(events) >= limit:
                    break
                title = self._child_text(item, "title")
                description = self._child_text(item, "description")
                if not _is_automotive_relevant(title, description):
                    continue
                link = self._child_text(item, "link")
                pub_date = self._child_text(item, "pubDate")
                standard_code = _extract_celex(link) or title[:60]
                # The same act listed twice (within or across feeds) would
                # only overwrite itself downstream; keep the first occurrence.
                if standard_code in seen_codes:
                    continue
                seen_codes.add(standard_code)
                events.append(RawEvent(
                    source="EUR-Lex",
                    source_label="欧盟官方公报",
                    standard_code=standard_code,
                    title=title,
                    summary=description[:500],
                    full_text_url=link,
                    status="enacted",
                    # Left empty when the item has no pubDate.
                    published_at=_parse_rss_date(pub_date) if pub_date else "",
                    effective_at=None,
                    category="EU法规",
                    tags=_extract_eurlex_tags(title, description),
                    raw_text=f"{title}\n{description}",
                ))
        return events[:limit]

    @staticmethod
    def _child_text(item: Any, name: str) -> str:
        """Return the stripped text of the first child tag ``name``, or "" when missing."""
        tag = item.find(name)
        return tag.get_text(strip=True) if tag else ""
 def _extract_eurlex_tags(title: str, description: str) -> list[str]:
    combined = title + " " + description
    tag_map = {
        "AI Act": "EU AI Act",
        "artificial intelligence": "EU AI Act",
        "R155": "UN R155",
        "R156": "UN R156",
        "cybersecurity": "网络安全",
        "emission": "排放",
        "autonomous": "自动驾驶",
        "ADAS": "ADAS",
    }
    combined_lower = combined.lower()
    tags = []
    for kw, tag in tag_map.items():
        if kw.lower() in combined_lower:
            tags.append(tag)
    return tags[:5]
--- a/backend/app/infrastructure/perception/crawlers/guobiao_crawler.py
+++ b/backend/app/infrastructure/perception/crawlers/guobiao_crawler.py
@@ -0,0 +1,92 @@
 """Crawlers for the 国标委 (SAMR) standard information platform."""
 from __future__ import annotations
 import httpx
 from loguru import logger
 from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
 from ._utils import extract_tags, parse_date
 # List endpoint queried by _fetch_page for both standard types.
 _BASE_URL = "https://openstd.samr.gov.cn/bzgk/std/std_list_type"
 # Sent with every request so the crawler identifies itself.
 _HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; RegulatoryBot/1.0)"}
 def _fetch_page(std_type: int, page: int, page_size: int) -> list[dict]:
    """Request one result page and return its "rows" list.

    Any failure (transport error, HTTP error status, body that is not the
    expected JSON) is logged and reported as an empty list.
    """
    query = {"p.p1": std_type, "p.p2": "车"}
    query.update({"p.p90": "circulation_date", "p.p91": "desc"})
    query.update({"p.p6": page, "p.p7": page_size})
    try:
        response = httpx.get(_BASE_URL, params=query, headers=_HEADERS, timeout=30)
        response.raise_for_status()
        rows = response.json().get("rows", []) or []
    except Exception as exc:
        logger.warning("国标委 fetch failed type={} page={} err={}", std_type, page, exc)
        return []
    return rows
 def _row_to_raw_event(row: dict, source_label: str) -> RawEvent:
    """Convert one API row into a RawEvent.

    Fields that are missing, ``None`` (JSON null) or empty fall back to
    defaults instead of raising: the title falls back to the standard code,
    the category to "国家标准", and the status to "enacted" unless the status
    text marks a consultation or draft.
    """
    standard_code = row.get("std_code") or ""
    title = row.get("std_name") or standard_code
    published_at = parse_date(row.get("release_date") or "")
    effective_at_raw = row.get("implement_date") or ""
    effective_at = parse_date(effective_at_raw) if effective_at_raw else None
    status_text = row.get("std_status") or ""
    if "征求意见" in status_text:
        status = "consultation"
    elif "报批" in status_text or "草案" in status_text:
        status = "draft"
    else:
        status = "enacted"
    return RawEvent(
        source="国标委",
        source_label=source_label,
        standard_code=standard_code,
        title=title,
        summary=title,
        # "or ''" keeps a null id from being rendered as the text "None".
        full_text_url=f"https://openstd.samr.gov.cn/bzgk/std/detail?id={row.get('id') or ''}",
        status=status,
        published_at=published_at,
        effective_at=effective_at,
        category=row.get("std_type") or "国家标准",
        tags=extract_tags(standard_code, title),
        raw_text=f"{standard_code} {title}",
    )
 class _GuobiaoCrawlerBase(BaseCrawler):
    """Shared paging logic for the two 国标委 list crawlers.

    Subclasses set ``_std_type`` (sent as the ``std_type`` argument of
    ``_fetch_page``) and ``_source_label`` (stored on every event).
    """

    _std_type: int
    _source_label: str
    _page_size: int = 20

    def fetch(self, limit: int = 50) -> list[RawEvent]:
        """Fetch up to `limit` events, one per distinct standard code.

        Paging stops at the first empty page, at a page that contributes no
        new standard code, or after ``max(10, limit)`` pages.
        """
        events: list[RawEvent] = []
        seen_codes: set[str] = set()
        page = 1
        max_pages = max(10, limit)
        while len(events) < limit and page <= max_pages:
            rows = _fetch_page(std_type=self._std_type, page=page, page_size=self._page_size)
            if not rows:
                break
            added = 0
            for row in rows:
                event = _row_to_raw_event(row, self._source_label)
                if event.standard_code in seen_codes:
                    continue
                seen_codes.add(event.standard_code)
                events.append(event)
                added += 1
            # A page of already-seen rows means paging is not advancing.
            if not added:
                break
            page += 1
        return events[:limit]

 class GuobiaoMandatoryCrawler(_GuobiaoCrawlerBase):
    """Fetch mandatory national standards (强制性) related to vehicles."""

    _std_type = 1
    _source_label = "国标委·强制性"

 class GuobiaoRecommendedCrawler(_GuobiaoCrawlerBase):
    """Fetch recommended national standards (推荐性) related to vehicles."""

    _std_type = 2
    _source_label = "国标委·推荐性"
--- a/backend/app/infrastructure/perception/llm_pipeline.py
+++ b/backend/app/infrastructure/perception/llm_pipeline.py
@@ -0,0 +1,241 @@
 """LLM-driven pipeline for regulatory event enrichment."""
 from __future__ import annotations
 import json
 import math
 from typing import Any
 from loguru import logger
 from app.config.settings import settings
 from app.infrastructure.embedding.openai_compatible_embedding_provider import (
    OpenAICompatibleEmbeddingProvider,
 )
 from app.services.llm.llm_factory import get_llm_client
 # System prompt for step 1 (structure extraction).
 _EXTRACT_SYSTEM = (
    "You are a regulatory compliance expert specialising in automotive standards "
    "(GB, UN-ECE, ISO, EU). Extract structured information from regulation text. "
    "Return valid JSON only — no markdown fences, no extra keys."
 )
 # System prompt for step 2 (impact assessment over retrieved excerpts).
 _ASSESS_SYSTEM = (
    "You are an automotive compliance analyst. Given a regulation and related document excerpts, "
    "identify which documents are affected and what actions are required. "
    "Return a JSON array only."
 )
 # System prompt for step 3 (classifying one changed paragraph pair).
 _DIFF_SYSTEM = (
    "You are a regulatory change analyst. Given an old and new version of a regulation paragraph, "
    "classify the type of change and summarise it. "
    "Return JSON only: {\"change_type\": \"tightened|relaxed|added|removed\", \"summary\": \"...\"}"
 )
 # Paragraph pairs whose embedding cosine similarity is below this value are
 # treated as changed by compute_diff.
 _SIMILARITY_THRESHOLD = 0.85
 def _cosine(a: list[float], b: list[float]) -> float:
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)
 def _llm_json(client: Any, messages: list[dict]) -> Any:
    """Call LLM and parse JSON response; return None on failure."""
    try:
        resp = client.chat(messages)
        text = (resp.content or "").strip()
        if text.startswith("```"):
            text = text.split("```")[1]
            if text.startswith("json"):
                text = text[4:]
        return json.loads(text)
    except Exception as exc:
        logger.warning("LLM JSON parse failed: {}", exc)
        return None
 class LlmPipeline:
    """Three-step enrichment pipeline for crawled regulatory events.

    Steps: structure extraction, RAG-based impact assessment, and a
    paragraph-level semantic diff between two versions of a text.
    """

    def __init__(self) -> None:
        self._client = get_llm_client(
            provider=settings.llm_provider,
            model=settings.llm_model,
        )
        self._embedder = OpenAICompatibleEmbeddingProvider()

    # ------------------------------------------------------------------
    # Step 1: Structure extraction
    # ------------------------------------------------------------------
    def extract_structure(self, event: dict) -> dict:
        """Extract obligations, deadlines, scope, penalties, impact_level from event text.

        Returns:
            A dict meant to be merged into the event. When the LLM reply is
            not a JSON object, a full set of neutral defaults is returned.
            Otherwise only the five expected keys are kept (so a stray key
            such as "id" can never overwrite event fields), non-list
            ``obligations``/``deadlines`` become empty lists, and an
            ``impact_level`` outside high/medium/low is dropped.
        """
        prompt = f"""Extract structured compliance information from this regulation:
 Standard: {event.get('standard_code', '')}
 Title: {event.get('title', '')}
 Source: {event.get('source_label', '')}
 Summary: {event.get('summary', '')}
 Tags: {', '.join(event.get('tags') or [])}
 Return JSON with exactly these keys:
 {{
  "obligations": [{{"text": "...", "deontic": "must|shall|may|prohibited", "subject": "...", "object": "...", "condition": ""}}],
  "deadlines": [{{"date": "YYYY-MM-DD or null", "description": "..."}}],
  "scope": "one sentence describing who/what this applies to",
  "penalties": "one sentence on consequences of non-compliance, or null",
  "impact_level": "high|medium|low"
 }}"""
        messages = [
            {"role": "system", "content": _EXTRACT_SYSTEM},
            {"role": "user", "content": prompt},
        ]
        result = _llm_json(self._client, messages)
        if not isinstance(result, dict):
            return {
                "obligations": [],
                "deadlines": [],
                "scope": "",
                "penalties": "",
                "impact_level": "medium",
            }
        expected_keys = ("obligations", "deadlines", "scope", "penalties", "impact_level")
        structure = {key: result[key] for key in expected_keys if key in result}
        for list_key in ("obligations", "deadlines"):
            if list_key in structure and not isinstance(structure[list_key], list):
                structure[list_key] = []
        if "impact_level" in structure:
            level = str(structure["impact_level"]).strip().lower()
            if level in ("high", "medium", "low"):
                structure["impact_level"] = level
            else:
                # Leave the event's current level untouched rather than
                # storing a value that no impact filter or counter matches.
                del structure["impact_level"]
        return structure

    # ------------------------------------------------------------------
    # Step 2: Impact assessment
    # ------------------------------------------------------------------
    def assess_impact(self, event: dict, retrieval_service: Any) -> list[dict]:
        """Use RAG to find affected documents and generate recommendations.

        Returns:
            The LLM's per-document assessments (non-dict entries removed,
            scores replaced by the retrieval scores), the raw retrieved
            excerpts when the LLM reply is not a JSON array, or an empty list
            when retrieval fails or finds nothing.
        """
        obligations = event.get("obligations") or []
        if not isinstance(obligations, list):
            obligations = []
        # Obligations come from LLM output; skip entries that are not objects.
        obligation_texts = " ".join(
            str(o.get("text", "")) for o in obligations[:3] if isinstance(o, dict)
        )
        query = f"{event.get('standard_code', '')} {event.get('title', '')} {obligation_texts}"
        try:
            chunks = retrieval_service.retrieve(query=query, top_k=5)
        except Exception as exc:
            logger.warning("RAG retrieval failed: {}", exc)
            return []
        if not chunks:
            return []
        # Keep the first (highest-ranked) chunk of each document.
        seen: set[str] = set()
        doc_excerpts: list[dict] = []
        for chunk in chunks:
            if chunk.doc_id not in seen:
                seen.add(chunk.doc_id)
                doc_excerpts.append({
                    "doc_id": chunk.doc_id,
                    "doc_name": chunk.doc_title,
                    "score": round(float(chunk.score if chunk.score is not None else 0), 4),
                    "snippet": (chunk.text or "")[:300],
                    "clause": getattr(chunk, "section_title", "") or "",
                })
        context = "\n".join(
            f"[{d['doc_name']} {d['clause']}] score={d['score']}: {d['snippet']}"
            for d in doc_excerpts
        )
        prompt = f"""Regulation: {event.get('standard_code')} — {event.get('title')}
 Obligations: {obligation_texts or event.get('summary', '')}
 Affected documents found in knowledge base:
 {context}
 For each document, assess impact and recommend action. Return JSON array:
 [{{"doc_id":"...","doc_name":"...","score":0.0,"key_clauses":"...","recommendation":"one sentence action"}}]"""
        messages = [
            {"role": "system", "content": _ASSESS_SYSTEM},
            {"role": "user", "content": prompt},
        ]
        result = _llm_json(self._client, messages)
        if isinstance(result, list):
            score_map = {d["doc_id"]: d["score"] for d in doc_excerpts}
            assessments = [item for item in result if isinstance(item, dict)]
            for item in assessments:
                # Trust the retrieval score over whatever the LLM echoed back.
                if item.get("doc_id") in score_map:
                    item["score"] = score_map[item["doc_id"]]
            return assessments
        return doc_excerpts

    # ------------------------------------------------------------------
    # Step 3: Semantic diff
    # ------------------------------------------------------------------
    def compute_diff(self, old_text: str, new_text: str) -> dict:
        """Compare old and new regulation text; return changed sections and summary.

        Texts are split into non-empty lines ("paragraphs") and compared
        position by position: paragraph i of the old text against paragraph i
        of the new text. A pair counts as changed when the cosine similarity
        of its embeddings is below the configured threshold
        (``settings.perception_diff_similarity_threshold``, falling back to
        the module default). Surplus paragraphs are reported as added/removed.

        Returns:
            ``{"changed_sections": [...], "change_summary": "..."}``.
        """
        old_paras = [p.strip() for p in old_text.split("\n") if p.strip()]
        new_paras = [p.strip() for p in new_text.split("\n") if p.strip()]
        if not old_paras or not new_paras:
            return {"changed_sections": [], "change_summary": "No comparable text."}
        all_paras = old_paras + new_paras
        try:
            all_embeddings = self._embedder.embed_texts(all_paras)
        except Exception as exc:
            logger.warning("Embedding for diff failed: {}", exc)
            return {"changed_sections": [], "change_summary": "Diff unavailable (embedding error)."}
        if len(all_embeddings) != len(all_paras):
            # Without one vector per paragraph the positional pairing below
            # would compare the wrong texts or run out of range.
            logger.warning(
                "Embedding for diff failed: expected {} vectors, got {}",
                len(all_paras),
                len(all_embeddings),
            )
            return {"changed_sections": [], "change_summary": "Diff unavailable (embedding error)."}
        old_embeddings = all_embeddings[: len(old_paras)]
        new_embeddings = all_embeddings[len(old_paras):]
        threshold = getattr(
            settings, "perception_diff_similarity_threshold", _SIMILARITY_THRESHOLD
        )
        changed_sections: list[dict] = []
        max_len = max(len(old_paras), len(new_paras))
        for i in range(max_len):
            if i >= len(old_paras):
                # New paragraph added
                changed_sections.append({
                    "old_text": "",
                    "new_text": new_paras[i][:300],
                    "similarity": 0.0,
                    "change_type": "added",
                    "summary": "New paragraph added.",
                })
                continue
            if i >= len(new_paras):
                # Old paragraph removed
                changed_sections.append({
                    "old_text": old_paras[i][:300],
                    "new_text": "",
                    "similarity": 0.0,
                    "change_type": "removed",
                    "summary": "Paragraph removed.",
                })
                continue
            # Both exist — compare via embeddings
            sim = _cosine(old_embeddings[i], new_embeddings[i])
            if sim < threshold:
                messages = [
                    {"role": "system", "content": _DIFF_SYSTEM},
                    {"role": "user", "content": f"OLD: {old_paras[i][:500]}\nNEW: {new_paras[i][:500]}"},
                ]
                classification = _llm_json(self._client, messages)
                if not isinstance(classification, dict):
                    # Failed call or a reply that is not a JSON object.
                    classification = {}
                changed_sections.append({
                    "old_text": old_paras[i][:300],
                    "new_text": new_paras[i][:300],
                    "similarity": round(sim, 3),
                    "change_type": classification.get("change_type", "modified"),
                    "summary": classification.get("summary", ""),
                })
        if not changed_sections:
            change_summary = "No substantive changes detected between versions."
        else:
            # dict.fromkeys de-duplicates in first-seen order, so the summary
            # text is the same on every run for the same input.
            types = dict.fromkeys(str(s["change_type"]) for s in changed_sections)
            change_summary = (
                f"{len(changed_sections)} paragraph(s) changed: "
                + ", ".join(types)
                + ". "
                + str(changed_sections[0].get("summary", "") or "")
            )
        return {"changed_sections": changed_sections, "change_summary": change_summary}
--- a/backend/app/infrastructure/perception/mock_event_store.py
+++ b/backend/app/infrastructure/perception/mock_event_store.py
@@ -4,6 +4,8 @@ from __future__ import annotations
 from typing import Any
 from app.infrastructure.perception.base_event_store import BaseEventStore
 MOCK_EVENTS: list[dict[str, Any]] = [
    # ------------------------------------------------------------------ HIGH
    {
@@ -379,18 +381,18 @@ MOCK_EVENTS: list[dict[str, Any]] = [
    },
 ]
-# Index for fast lookup
+class MockEventStore(BaseEventStore):
 _EVENT_INDEX: dict[str, dict] = {e["id"]: e for e in MOCK_EVENTS}
 class MockEventStore:
    """In-memory mock store for regulatory events."""
    def __init__(self) -> None:
        self._events: list[dict] = [dict(e) for e in MOCK_EVENTS]
        self._index: dict[str, dict] = {e["id"]: e for e in self._events}
    def all(self) -> list[dict]:
-        return list(MOCK_EVENTS)
+        return list(self._events)
    def get(self, event_id: str) -> dict | None:
-        return _EVENT_INDEX.get(event_id)
+        return self._index.get(event_id)
    def filter(
        self,
@@ -399,23 +401,39 @@ class MockEventStore:
        impact_level: str | None = None,
        limit: int = 50,
    ) -> list[dict]:
-        events = list(MOCK_EVENTS)
+        events = list(self._events)
        if source:
            events = [e for e in events if e["source"] == source]
        if impact_level:
            events = [e for e in events if e["impact_level"] == impact_level]
-        events.sort(key=lambda e: e["published_at"], reverse=True)
+        events.sort(key=lambda e: e.get("published_at") or "", reverse=True)
        return events[:limit]
    def stats(self) -> dict:
        from datetime import date, timedelta
-        events = MOCK_EVENTS
+        events = self._events
        cutoff = (date.today() - timedelta(days=90)).isoformat()
        return {
            "total": len(events),
            "high_impact": sum(1 for e in events if e["impact_level"] == "high"),
            "medium_impact": sum(1 for e in events if e["impact_level"] == "medium"),
            "low_impact": sum(1 for e in events if e["impact_level"] == "low"),
-            "recent_90d": sum(1 for e in events if e["published_at"] >= cutoff),
+            "recent_90d": sum(1 for e in events if (e.get("published_at") or "") >= cutoff),
        }
    def upsert(self, event: dict) -> None:
        """Insert or update event in the in-memory list (used in tests)."""
        existing = self._index.get(event["id"])
        if existing:
            existing.update(event)
        else:
            self._events.append(event)
            self._index[event["id"]] = event
    def get_by_standard_code(self, standard_code: str) -> dict | None:
        """Return most-recent event with matching standard_code."""
        matches = [e for e in self._events if e.get("standard_code") == standard_code]
        if not matches:
            return None
        return max(matches, key=lambda e: e.get("published_at", ""))
--- a/backend/app/infrastructure/perception/postgres_event_store.py
+++ b/backend/app/infrastructure/perception/postgres_event_store.py
@@ -0,0 +1,225 @@
 """PostgreSQL-backed regulatory event store."""
 from __future__ import annotations
 import json
 from contextlib import contextmanager
 from datetime import UTC, date, datetime, timedelta
 from typing import Any
 import psycopg2
 import psycopg2.extras
 from psycopg2.pool import ThreadedConnectionPool
 from app.config.settings import settings
 from app.infrastructure.perception.base_event_store import BaseEventStore
 _CREATE_TABLE = """
 CREATE TABLE IF NOT EXISTS regulation_events (
    id               TEXT PRIMARY KEY,
    source           TEXT NOT NULL,
    source_label     TEXT,
    standard_code    TEXT NOT NULL,
    title            TEXT NOT NULL,
    summary          TEXT,
    full_text_url    TEXT,
    status           TEXT,
    impact_level     TEXT,
    published_at     DATE,
    effective_at     DATE,
    category         TEXT,
    tags             TEXT[],
    obligations      JSONB,
    deadlines        JSONB,
    scope            TEXT,
    penalties        TEXT,
    content_hash     TEXT,
    previous_hash    TEXT,
    change_summary   TEXT,
    changed_sections JSONB,
    affected_docs    JSONB,
    crawled_at       TIMESTAMPTZ DEFAULT now(),
    processed_at     TIMESTAMPTZ,
    raw_storage_key  TEXT
 );
 CREATE INDEX IF NOT EXISTS reg_events_source_date
    ON regulation_events (source, published_at DESC);
 CREATE INDEX IF NOT EXISTS reg_events_impact_date
    ON regulation_events (impact_level, published_at DESC);
 """
 _ALL_COLUMNS = (
    "id", "source", "source_label", "standard_code", "title", "summary",
    "full_text_url", "status", "impact_level", "published_at", "effective_at",
    "category", "tags", "obligations", "deadlines", "scope", "penalties",
    "content_hash", "previous_hash", "change_summary", "changed_sections",
    "affected_docs", "crawled_at", "processed_at", "raw_storage_key",
 )
 def _row_to_dict(row: dict[str, Any]) -> dict:
    """Convert a psycopg2 RealDictRow to a plain dict with serialized JSON fields."""
    d = dict(row)
    for field in ("obligations", "deadlines", "changed_sections", "affected_docs"):
        val = d.get(field)
        if isinstance(val, str):
            d[field] = json.loads(val)
    for date_field in ("published_at", "effective_at"):
        val = d.get(date_field)
        if isinstance(val, datetime):
            d[date_field] = val.date().isoformat()
        elif isinstance(val, date):
            d[date_field] = val.isoformat()
    for ts_field in ("crawled_at", "processed_at"):
        val = d.get(ts_field)
        if isinstance(val, datetime):
            d[ts_field] = val.isoformat()
    return d
 class PostgresEventStore(BaseEventStore):
    """Regulatory event store backed by the PostgreSQL ``regulation_events`` table.

    Connections are taken from a ``ThreadedConnectionPool`` built from the
    ``postgres_*`` settings. Every read returns plain dicts produced by
    ``_row_to_dict``.
    """

    def __init__(self) -> None:
        """Create the connection pool and make sure the schema exists."""
        self._pool = ThreadedConnectionPool(
            minconn=1,
            maxconn=5,
            host=settings.postgres_host,
            port=settings.postgres_port,
            user=settings.postgres_user,
            password=settings.postgres_password,
            dbname=settings.postgres_db,
        )
        self._ensure_schema()

    def _ensure_schema(self) -> None:
        """Execute ``_CREATE_TABLE``; commit on success, roll back and re-raise on failure."""
        with self._conn() as conn:
            try:
                with conn.cursor() as cur:
                    cur.execute(_CREATE_TABLE)
                conn.commit()
            except Exception:
                conn.rollback()
                raise

    @contextmanager
    def _conn(self):
        """Yield a pooled connection and always hand it back to the pool."""
        conn = self._pool.getconn()
        try:
            yield conn
        finally:
            self._pool.putconn(conn)

    def all(self) -> list[dict]:
        """Return every event, newest ``published_at`` first (NULL dates last)."""
        with self._conn() as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
                cur.execute(
                    "SELECT * FROM regulation_events ORDER BY published_at DESC NULLS LAST"
                )
                return [_row_to_dict(r) for r in cur.fetchall()]

    def get(self, event_id: str) -> dict | None:
        """Return the event whose primary key is ``event_id``, or ``None``."""
        with self._conn() as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
                cur.execute(
                    "SELECT * FROM regulation_events WHERE id = %s", (event_id,)
                )
                row = cur.fetchone()
                return _row_to_dict(row) if row else None

    def filter(
        self,
        *,
        source: str | None = None,
        impact_level: str | None = None,
        limit: int = 50,
    ) -> list[dict]:
        """Return up to ``limit`` events, newest first, optionally narrowed.

        Falsy ``source`` / ``impact_level`` values (``None`` or ``""``) mean
        "no filter on this column".
        """
        conditions: list[str] = []
        params: list[Any] = []
        if source:
            conditions.append("source = %s")
            params.append(source)
        if impact_level:
            conditions.append("impact_level = %s")
            params.append(impact_level)
        # Only fixed column predicates are interpolated; all values stay bound parameters.
        where = ("WHERE " + " AND ".join(conditions)) if conditions else ""
        params.append(limit)
        sql = f"""
            SELECT * FROM regulation_events
            {where}
            ORDER BY published_at DESC NULLS LAST
            LIMIT %s
        """
        with self._conn() as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
                cur.execute(sql, params)
                return [_row_to_dict(r) for r in cur.fetchall()]

    def stats(self) -> dict:
        """Return event counters.

        Keys: ``total``, ``high_impact``, ``medium_impact``, ``low_impact`` and
        ``recent_90d`` (events published within the last 90 days). All counters
        come from one aggregate query, so they describe the same snapshot.
        """
        cutoff = (date.today() - timedelta(days=90)).isoformat()
        sql = """
            SELECT
                COUNT(*) AS total,
                COUNT(*) FILTER (WHERE impact_level = 'high') AS high_impact,
                COUNT(*) FILTER (WHERE impact_level = 'medium') AS medium_impact,
                COUNT(*) FILTER (WHERE impact_level = 'low') AS low_impact,
                COUNT(*) FILTER (WHERE published_at >= %s) AS recent_90d
            FROM regulation_events
        """
        with self._conn() as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
                cur.execute(sql, (cutoff,))
                row = cur.fetchone() or {}
        keys = ("total", "high_impact", "medium_impact", "low_impact", "recent_90d")
        return {key: int(row.get(key) or 0) for key in keys}

    def upsert(self, event: dict) -> None:
        """Insert or update a regulation event.

        Only keys of ``event`` that are known table columns are written, so a
        partial event leaves the other columns of an existing row untouched.
        JSONB columns are serialized with ``json.dumps``; an empty-string date is
        stored as NULL because the DATE columns reject ``''``.
        """
        cols = [c for c in _ALL_COLUMNS if c in event]
        placeholders = ", ".join(f"%({c})s" for c in cols)
        assignments = ", ".join(f"{c} = EXCLUDED.{c}" for c in cols if c != "id")
        # With nothing but "id" supplied there is no column to update, and
        # "DO UPDATE SET" followed by an empty list is not valid SQL.
        on_conflict = f"DO UPDATE SET {assignments}" if assignments else "DO NOTHING"
        sql = f"""
            INSERT INTO regulation_events ({', '.join(cols)})
            VALUES ({placeholders})
            ON CONFLICT (id) {on_conflict}
        """
        json_cols = ("obligations", "deadlines", "changed_sections", "affected_docs")
        date_cols = ("published_at", "effective_at")
        row: dict[str, Any] = {}
        for c in cols:
            val = event[c]
            if c in json_cols and val is not None:
                val = json.dumps(val, ensure_ascii=False)
            elif c in date_cols and val == "":
                val = None
            row[c] = val
        with self._conn() as conn:
            try:
                with conn.cursor() as cur:
                    cur.execute(sql, row)
                conn.commit()
            except Exception:
                conn.rollback()
                raise

    def get_by_standard_code(self, standard_code: str) -> dict | None:
        """Return the most recently published event for ``standard_code``, or ``None``."""
        with self._conn() as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
                cur.execute(
                    """SELECT * FROM regulation_events
                       WHERE standard_code = %s
                       ORDER BY published_at DESC NULLS LAST
                       LIMIT 1""",
                    (standard_code,),
                )
                row = cur.fetchone()
                return _row_to_dict(row) if row else None
--- a/backend/app/shared/bootstrap.py
+++ b/backend/app/shared/bootstrap.py
@@ -19,6 +19,15 @@ from app.infrastructure.parser.local_chunk_builder import LocalRegulationChunkBu
 from app.infrastructure.parser.local_document_parser import LocalDocumentParser
 from app.infrastructure.parser.vector_chunk_builder import AliyunVectorChunkBuilder
 from app.infrastructure.perception.mock_event_store import MockEventStore
 from app.application.perception.crawl_service import CrawlService
 from app.infrastructure.perception.base_event_store import BaseEventStore
 from app.infrastructure.perception.crawlers.catarc_crawler import CatarcCrawler
 from app.infrastructure.perception.crawlers.guobiao_crawler import (
    GuobiaoMandatoryCrawler,
    GuobiaoRecommendedCrawler,
 )
 from app.infrastructure.perception.crawlers.eurlex_crawler import EurlexCrawler
 from app.infrastructure.perception.llm_pipeline import LlmPipeline
 from app.infrastructure.session.in_memory_conversation_store import InMemoryConversationStore
 from app.infrastructure.storage.json_document_processing_store import JsonDocumentProcessingStore
 from app.infrastructure.storage.json_document_repository import JsonDocumentRepository
@@ -293,11 +302,35 @@ def get_agent_conversation_service() -> AgentConversationService:
    )
@lru_cache
def get_event_store() -> BaseEventStore:
    """Return the process-wide event store chosen by DOCUMENT_REPOSITORY_BACKEND.

    ``"postgres"`` selects the PostgreSQL store; any other value falls back to
    the in-memory mock store.
    """
    use_postgres = settings.document_repository_backend == "postgres"
    if not use_postgres:
        return MockEventStore()
    # Imported only on this branch, so the psycopg2-backed module is not loaded
    # when the mock backend is in use.
    from app.infrastructure.perception.postgres_event_store import PostgresEventStore
    return PostgresEventStore()
@lru_cache
 def get_perception_service() -> PerceptionService:
    """Return perception service for regulatory intelligence."""
    return PerceptionService(
-        event_store=MockEventStore(),
+        event_store=get_event_store(),
        retrieval_service=get_retrieval_service(),
    )
@lru_cache
def get_crawl_service() -> CrawlService:
    """Return the cached crawl service wired with one crawler per source label."""
    crawler_types = (
        ("CATARC", CatarcCrawler),
        ("国标委·强制性", GuobiaoMandatoryCrawler),
        ("国标委·推荐性", GuobiaoRecommendedCrawler),
        ("EUR-Lex", EurlexCrawler),
    )
    crawlers = {label: crawler_cls() for label, crawler_cls in crawler_types}
    event_store = get_event_store()
    llm_pipeline = LlmPipeline()
    retrieval_service = get_retrieval_service()
    return CrawlService(
        crawlers=crawlers,
        event_store=event_store,
        llm_pipeline=llm_pipeline,
        retrieval_service=retrieval_service,
    )
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -9,6 +9,8 @@ pydantic-settings>=2.0.0
 python-dotenv>=1.0.0
 loguru>=0.7.0
 httpx>=0.25.0
 beautifulsoup4>=4.12.0
 lxml>=5.0.0
 tiktoken>=0.5.0
 tenacity>=8.2.0
--- a/backend/tests/perception/init.py
+++ b/backend/tests/perception/init.py
--- a/backend/tests/perception/test_base_event_store.py
+++ b/backend/tests/perception/test_base_event_store.py
@@ -0,0 +1,95 @@
 """Contract tests: any BaseEventStore implementation must pass these."""
 from app.infrastructure.perception.base_event_store import BaseEventStore
 from app.infrastructure.perception.mock_event_store import MockEventStore
 def _store() -> BaseEventStore:
    """Build a fresh store under test; every call returns a new instance."""
    store: BaseEventStore = MockEventStore()
    return store
 def test_is_base_event_store():
    """The store under test must be a BaseEventStore instance."""
    store = _store()
    assert isinstance(store, BaseEventStore)
 def test_all_returns_list():
    """all() returns a non-empty list."""
    events = _store().all()
    assert isinstance(events, list)
    assert len(events) >= 1
 def test_get_known_id():
    """get() finds an event by the id of an event returned from all()."""
    store = _store()
    known_id = store.all()[0]["id"]
    fetched = store.get(known_id)
    assert fetched is not None
    assert fetched["id"] == known_id
 def test_get_unknown_returns_none():
    """get() answers None for an id that was never stored."""
    missing = _store().get("does-not-exist")
    assert missing is None
 def test_filter_by_impact():
    """filter(impact_level=...) returns only events of that impact level."""
    store = _store()
    for event in store.filter(impact_level="high", limit=100):
        assert event["impact_level"] == "high"
 def test_filter_limit():
    """filter() never returns more events than ``limit``."""
    limit = 3
    limited = _store().filter(limit=limit)
    assert len(limited) <= limit
 def test_stats_keys():
    """stats() must expose every counter, including the per-level ``low_impact`` one.

    All three impact levels are required so that a store which silently drops
    one of them fails the contract instead of passing unnoticed.
    """
    stats = _store().stats()
    expected = ("total", "high_impact", "medium_impact", "low_impact", "recent_90d")
    missing = [key for key in expected if key not in stats]
    assert not missing, f"missing keys: {missing}"
 def test_upsert_and_get():
    """An event written with upsert() can be read back with get()."""
    event_id = "test-upsert-001"
    new_event = {
        "id": event_id,
        "source": "TEST",
        "source_label": "Test Source",
        "standard_code": "TST-001",
        "title": "Test Event",
        "summary": "A test event",
        "full_text_url": "https://example.com",
        "status": "draft",
        "impact_level": "low",
        "published_at": "2026-01-01",
        "effective_at": None,
        "category": "test",
        "tags": ["test"],
        "content_hash": "abc123",
        "previous_hash": None,
    }
    store = _store()
    store.upsert(new_event)
    stored = store.get(event_id)
    assert stored is not None
    assert stored["title"] == "Test Event"
 def test_get_by_standard_code():
    """get_by_standard_code() returns an event carrying the requested code."""
    store = _store()
    code = store.all()[0]["standard_code"]
    match = store.get_by_standard_code(code)
    assert match is not None
    assert match["standard_code"] == code
 def test_upsert_updates_existing():
    """upsert() with an existing id overwrites that event's fields."""
    store = _store()
    first = store.all()[0]
    original_id = first["id"]
    replacement = {
        "id": original_id,
        "title": "Updated Title",
        "impact_level": first["impact_level"],
        "standard_code": first.get("standard_code", ""),
        "source": first["source"],
        "source_label": first.get("source_label", ""),
        "summary": "Updated",
        "full_text_url": "",
        "status": first["status"],
        "published_at": first.get("published_at", ""),
        "effective_at": None,
        "category": first.get("category", ""),
        "tags": [],
        "content_hash": "newhash",
        "previous_hash": None,
    }
    store.upsert(replacement)
    updated = store.get(original_id)
    assert updated is not None
    assert updated["title"] == "Updated Title"
--- a/backend/tests/perception/test_crawl_service.py
+++ b/backend/tests/perception/test_crawl_service.py
@@ -0,0 +1,111 @@
 """Integration tests for CrawlService."""
 from __future__ import annotations
 from unittest.mock import MagicMock
 import hashlib
 import pytest
 from app.infrastructure.perception.crawlers.base import RawEvent
 from app.infrastructure.perception.mock_event_store import MockEventStore
 def _make_raw_event(code="TST-001"):
    """Build a RawEvent for source "TEST" whose code and title derive from ``code``."""
    fields = dict(
        source="TEST",
        source_label="Test",
        standard_code=code,
        title=f"Test {code}",
        summary="Summary",
        full_text_url="https://example.com",
        status="enacted",
        published_at="2026-01-01",
        effective_at=None,
        category="test",
        tags=["test"],
        raw_text="full text",
    )
    return RawEvent(**fields)
 def _make_service(raw_events):
    """Build a CrawlService whose single "TEST" crawler yields ``raw_events``.

    The LLM pipeline and retrieval service are mocks with canned answers; the
    event store is a fresh MockEventStore.
    """
    from app.application.perception.crawl_service import CrawlService

    crawler = MagicMock()
    crawler.fetch.return_value = raw_events

    structure = {
        "obligations": [], "deadlines": [], "scope": "test",
        "penalties": None, "impact_level": "low",
    }
    diff = {"changed_sections": [], "change_summary": "No changes."}
    pipeline = MagicMock()
    pipeline.extract_structure.return_value = structure
    pipeline.assess_impact.return_value = []
    pipeline.compute_diff.return_value = diff

    return CrawlService(
        crawlers={"TEST": crawler},
        event_store=MockEventStore(),
        llm_pipeline=pipeline,
        retrieval_service=MagicMock(),
    )
 def test_crawl_yields_progress_and_done():
    """run_crawl() emits a message whose "event" field is "done"."""
    service = _make_service([_make_raw_event("TST-001")])
    emitted = list(service.run_crawl())
    kinds = [message.get("event") for message in emitted]
    assert "done" in kinds
 def test_crawl_upserts_to_store():
    """A crawled event ends up in the event store under its standard code."""
    from app.application.perception.crawl_service import CrawlService

    store = MockEventStore()

    crawler = MagicMock()
    crawler.fetch.return_value = [_make_raw_event("NEW-001")]

    pipeline = MagicMock()
    pipeline.extract_structure.return_value = {
        "obligations": [], "deadlines": [], "scope": "",
        "penalties": None, "impact_level": "medium",
    }
    pipeline.assess_impact.return_value = []
    pipeline.compute_diff.return_value = {
        "changed_sections": [], "change_summary": "",
    }

    service = CrawlService(
        crawlers={"TEST": crawler},
        event_store=store,
        llm_pipeline=pipeline,
        retrieval_service=MagicMock(),
    )
    list(service.run_crawl())  # drain the generator so the crawl actually runs

    saved = store.get_by_standard_code("NEW-001")
    assert saved is not None
    assert saved["title"] == "Test NEW-001"
 def test_crawl_skips_unchanged_events():
    """An event whose stored content hash matches the crawl is not re-processed."""
    from app.application.perception.crawl_service import CrawlService

    raw = _make_raw_event("SKIP-001")
    content_hash = hashlib.sha256(raw.raw_text.encode()).hexdigest()
    event_id = hashlib.sha256("TEST-SKIP-001".encode()).hexdigest()[:12]

    # Seed the store with the hash of the exact text the crawler will return.
    store = MockEventStore()
    store.upsert({
        "id": event_id,
        "standard_code": "SKIP-001",
        "source": "TEST",
        "source_label": "Test",
        "title": "Test SKIP-001",
        "summary": "",
        "full_text_url": "",
        "status": "enacted",
        "impact_level": "low",
        "published_at": "2026-01-01",
        "effective_at": None,
        "category": "test",
        "tags": [],
        "content_hash": content_hash,
    })

    pipeline = MagicMock()
    crawler = MagicMock()
    crawler.fetch.return_value = [raw]
    service = CrawlService(
        crawlers={"TEST": crawler},
        event_store=store,
        llm_pipeline=pipeline,
        retrieval_service=MagicMock(),
    )
    list(service.run_crawl())
    pipeline.extract_structure.assert_not_called()
--- a/backend/tests/perception/test_crawlers.py
+++ b/backend/tests/perception/test_crawlers.py
@@ -0,0 +1,127 @@
 """Unit tests for crawlers — mock httpx responses."""
 from __future__ import annotations
 from unittest.mock import MagicMock, patch
 import pytest
 from app.infrastructure.perception.crawlers.base import RawEvent, BaseCrawler
 def test_raw_event_fields():
    """RawEvent keeps the values it was constructed with."""
    fields = dict(
        source="TEST",
        source_label="Test",
        standard_code="TST-001",
        title="Test",
        summary="Summary",
        full_text_url="https://example.com",
        status="enacted",
        published_at="2026-01-01",
        effective_at=None,
        category="test",
        tags=["a"],
        raw_text="full text here",
    )
    event = RawEvent(**fields)
    assert event.source == "TEST"
    assert event.tags == ["a"]
 CATARC_HTML = """
 <html><body>
 <table>
 <tr>
  <td><a href="/std/detail/123">GB 18384-2025</a></td>
  <td>电动汽车安全要求</td>
  <td>2025-11-15</td>
  <td>现行</td>
 </tr>
 <tr>
  <td><a href="/std/detail/456">GB/T 40429-2026</a></td>
  <td>汽车驾驶自动化分级</td>
  <td>2026-02-01</td>
  <td>即将实施</td>
 </tr>
 </table>
 </body></html>
 """
 def test_catarc_crawler_parses_html():
    """CatarcCrawler turns the sample HTML table into RawEvent objects."""
    from app.infrastructure.perception.crawlers.catarc_crawler import CatarcCrawler

    response = MagicMock()
    response.status_code = 200
    response.text = CATARC_HTML
    response.raise_for_status = MagicMock()

    with patch("httpx.get", return_value=response):
        events = CatarcCrawler().fetch(limit=10)

    assert isinstance(events, list)
    assert len(events) >= 1
    for event in events:
        assert isinstance(event, RawEvent)
    assert "GB 18384-2025" in [event.standard_code for event in events]
 GUOBIAO_JSON = {
    "rows": [
        {
            "std_code": "GB 18384-2025",
            "std_name": "电动汽车安全要求",
            "release_date": "2025-11-15",
            "implement_date": "2026-07-01",
            "std_status": "现行",
            "std_type": "强制性",
        },
    ]
 }
 def test_guobiao_crawler_parses_json():
    """GuobiaoMandatoryCrawler maps the sample JSON row to a RawEvent."""
    from app.infrastructure.perception.crawlers.guobiao_crawler import GuobiaoMandatoryCrawler

    response = MagicMock()
    response.status_code = 200
    response.json.return_value = GUOBIAO_JSON
    response.raise_for_status = MagicMock()

    with patch("httpx.get", return_value=response):
        events = GuobiaoMandatoryCrawler().fetch(limit=10)

    assert len(events) >= 1
    first = events[0]
    assert first.source == "国标委"
    assert first.standard_code == "GB 18384-2025"
 EURLEX_RSS = """<?xml version="1.0" encoding="UTF-8"?>
 <rss version="2.0">
  <channel>
    <title>EUR-Lex</title>
    <item>
      <title>Regulation (EU) 2024/1689 — AI Act</title>
      <link>https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:32024R1689</link>
      <description>The EU Artificial Intelligence Act enters into force.</description>
      <pubDate>Fri, 12 Jul 2024 00:00:00 GMT</pubDate>
    </item>
  </channel>
 </rss>"""
 def test_eurlex_crawler_parses_rss():
    """EurlexCrawler turns the sample RSS feed into events with source "EUR-Lex"."""
    from app.infrastructure.perception.crawlers.eurlex_crawler import EurlexCrawler
    mock_resp = MagicMock()
    mock_resp.status_code = 200
    mock_resp.text = EURLEX_RSS
    # httpx exposes Response.content as bytes; mirror that so the test exercises
    # the same type the crawler receives from a real response.
    mock_resp.content = EURLEX_RSS.encode("utf-8")
    mock_resp.raise_for_status = MagicMock()
    with patch("httpx.get", return_value=mock_resp):
        crawler = EurlexCrawler()
        events = crawler.fetch(limit=5)
    assert isinstance(events, list)
    assert len(events) >= 1
    assert events[0].source == "EUR-Lex"
--- a/backend/tests/perception/test_llm_pipeline.py
+++ b/backend/tests/perception/test_llm_pipeline.py
@@ -0,0 +1,77 @@
 """Unit tests for LlmPipeline — mock LLM client and embedding provider."""
 from __future__ import annotations
 from unittest.mock import MagicMock, patch
 import json
 import pytest
 def _make_pipeline():
    """Build an LlmPipeline wired to a mocked LLM client and embedding provider.

    Returns ``(pipeline, llm_client_mock, embedding_provider_mock)`` so tests can
    override the canned replies.
    """
    llm_reply = (
        '{"obligations":[{"text":"test obligation","deontic":"must","subject":"OEM",'
        '"object":"system","condition":""}],'
        '"deadlines":[{"date":"2026-07-01","description":"实施截止"}],'
        '"scope":"适用于M1类车辆","penalties":"罚款","impact_level":"high"}'
    )
    with patch("app.infrastructure.perception.llm_pipeline.get_llm_client") as llm_factory:
        with patch(
            "app.infrastructure.perception.llm_pipeline.OpenAICompatibleEmbeddingProvider"
        ) as embedding_cls:
            llm_client = MagicMock()
            llm_client.chat.return_value = MagicMock(content=llm_reply)
            llm_factory.return_value = llm_client

            embedder = MagicMock()
            embedder.embed_texts.return_value = [[0.1] * 1024, [0.9] * 1024]
            embedding_cls.return_value = embedder

            from app.infrastructure.perception.llm_pipeline import LlmPipeline
            return LlmPipeline(), llm_client, embedder
 def test_extract_structure_returns_dict():
    """extract_structure() returns a dict with obligations and an impact level."""
    pipeline, _llm_client, _embedder = _make_pipeline()
    event = {
        "id": "evt-001",
        "standard_code": "GB 18384-2025",
        "title": "电动汽车安全要求",
        "summary": "新增 IP67 级别防护",
        "source_label": "CATARC",
        "tags": ["电池安全"],
    }
    structure = pipeline.extract_structure(event)
    assert isinstance(structure, dict)
    for key in ("obligations", "impact_level"):
        assert key in structure
 def test_assess_impact_returns_list():
    """assess_impact() returns a list when retrieval yields one matching chunk."""
    pipeline, llm_client, _embedder = _make_pipeline()
    llm_client.chat.return_value = MagicMock(content='[{"doc_id":"d1","doc_name":"Safety Manual","score":0.85,"key_clauses":"§4.2","recommendation":"更新第4章"}]')

    chunk = MagicMock()
    chunk.doc_id = "d1"
    chunk.doc_title = "Safety Manual"
    chunk.score = 0.85
    chunk.text = "relevant text"
    chunk.section_title = "§4.2"
    retrieval = MagicMock()
    retrieval.retrieve.return_value = [chunk]

    event = {
        "standard_code": "GB 18384-2025",
        "title": "电动汽车安全要求",
        "obligations": [{"text": "OEM shall comply"}],
    }
    affected = pipeline.assess_impact(event, retrieval)
    assert isinstance(affected, list)
 def test_compute_diff_no_change():
    """compute_diff() on identical text still returns both result keys."""
    pipeline, _llm_client, embedder = _make_pipeline()
    embedder.embed_texts.return_value = [[0.5] * 1024, [0.5] * 1024]
    diff = pipeline.compute_diff("paragraph one", "paragraph one")
    assert isinstance(diff, dict)
    for key in ("changed_sections", "change_summary"):
        assert key in diff
 def test_compute_diff_detects_change():
    """compute_diff() returns a list of changed sections for differing texts."""
    pipeline, llm_client, embedder = _make_pipeline()
    # Two orthogonal unit vectors as the mocked embeddings.
    first_axis = [1.0] + [0.0] * 1023
    second_axis = [0.0] + [1.0] + [0.0] * 1022
    embedder.embed_texts.return_value = [first_axis, second_axis]
    llm_client.chat.return_value = MagicMock(content='{"change_type":"tightened","summary":"Requirement tightened"}')
    diff = pipeline.compute_diff("old paragraph text", "new tighter requirement text")
    assert isinstance(diff["changed_sections"], list)
--- a/backend/tests/perception/test_postgres_event_store.py
+++ b/backend/tests/perception/test_postgres_event_store.py
@@ -0,0 +1,98 @@
 """Unit tests for PostgresEventStore using a mocked psycopg2 pool."""
 from __future__ import annotations
 import json
 from unittest.mock import MagicMock, patch
 import pytest
 # Register stand-in psycopg2 modules before the module under test is imported.
 # setdefault only inserts a stub when no module is registered under that name
 # yet, so an already-imported real psycopg2 is left in place.
 import sys
 mock_psycopg2 = MagicMock()
 mock_psycopg2.extras = MagicMock()
 sys.modules.setdefault("psycopg2", mock_psycopg2)
 sys.modules.setdefault("psycopg2.extras", mock_psycopg2.extras)
 sys.modules.setdefault("psycopg2.pool", MagicMock())
 from app.infrastructure.perception.base_event_store import BaseEventStore
 SAMPLE_ROW = {
    "id": "pg-001",
    "source": "国标委",
    "source_label": "国家标准化管理委员会",
    "standard_code": "GB 18384-2025",
    "title": "电动汽车安全要求",
    "summary": "新增要求",
    "full_text_url": "https://openstd.samr.gov.cn",
    "status": "enacted",
    "impact_level": "high",
    "published_at": "2025-11-15",
    "effective_at": "2026-07-01",
    "category": "电动汽车安全",
    "tags": ["电池安全"],
    "obligations": None,
    "deadlines": None,
    "scope": None,
    "penalties": None,
    "content_hash": "abc123",
    "previous_hash": None,
    "change_summary": None,
    "changed_sections": None,
    "affected_docs": None,
    "crawled_at": "2026-06-05T10:00:00+00:00",
    "processed_at": None,
    "raw_storage_key": None,
 }
 def _make_store_with_pool(mock_pool):
    """Build a PostgresEventStore whose connection pool is ``mock_pool``.

    The pool class is patched on the module under test, not on ``psycopg2.pool``.
    That module binds the name with ``from psycopg2.pool import
    ThreadedConnectionPool``, so patching the defining module only takes effect
    on the very first import and later stores would silently reuse an earlier
    test's pool. Schema creation is stubbed out so construction runs no SQL.
    """
    from app.infrastructure.perception import postgres_event_store as store_module

    with patch.object(store_module, "ThreadedConnectionPool", return_value=mock_pool):
        with patch.object(store_module.PostgresEventStore, "_ensure_schema"):
            return store_module.PostgresEventStore()
 def _cursor_returning(rows):
    cursor = MagicMock()
    cursor.__enter__ = lambda s: s
    cursor.__exit__ = MagicMock(return_value=False)
    cursor.fetchall.return_value = rows
    cursor.fetchone.return_value = rows[0] if rows else None
    return cursor
 def test_is_base_event_store():
    """PostgresEventStore must be a BaseEventStore instance."""
    store = _make_store_with_pool(MagicMock())
    assert isinstance(store, BaseEventStore)
 def test_filter_returns_list():
    """filter() returns a list when the cursor serves one sample row."""
    connection = MagicMock()
    connection.__enter__.return_value = connection
    connection.__exit__.return_value = False
    connection.cursor.return_value = _cursor_returning([SAMPLE_ROW])

    pool = MagicMock()
    pool.getconn.return_value = connection

    store = _make_store_with_pool(pool)
    events = store.filter(limit=10)
    assert isinstance(events, list)
 def test_stats_returns_correct_keys():
    """stats() reports the total, high, medium and recent counters."""
    cursor = MagicMock()
    cursor.__enter__.return_value = cursor
    cursor.__exit__.return_value = False
    cursor.fetchone.return_value = {"count": 5}

    connection = MagicMock()
    connection.__enter__.return_value = connection
    connection.__exit__.return_value = False
    connection.cursor.return_value = cursor

    pool = MagicMock()
    pool.getconn.return_value = connection

    stats = _make_store_with_pool(pool).stats()
    expected_keys = ("total", "high_impact", "medium_impact", "recent_90d")
    for key in expected_keys:
        assert key in stats
--- a/docs/superpowers/plans/2026-06-05-perception-intelligence.md
+++ b/docs/superpowers/plans/2026-06-05-perception-intelligence.md
--- a/docs/superpowers/specs/2026-06-05-perception-intelligence-design.md
+++ b/docs/superpowers/specs/2026-06-05-perception-intelligence-design.md
@@ -0,0 +1,328 @@
 # Regulatory Signals Intelligence Enhancement — Design Spec
 > **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
 **Goal:** Replace the 20-item hardcoded MockEventStore with real regulatory data from Chinese and international sources, add LLM-driven structured extraction, impact assessment, and semantic change diff — all accessible through a manual-trigger crawl in the frontend.
 **Architecture:** Crawler Service (httpx + BeautifulSoup) → PostgreSQL EventStore → LLM Pipeline (extract → assess → diff) → existing PerceptionService interface. New code follows `api → application → domain ports → infrastructure` layering; no new files in `services/*` or `workflows/*`; `shared/bootstrap.py` is the composition root.
 **Tech Stack:** httpx, BeautifulSoup4, existing EmbeddingService (text-embedding-v3 via EMBEDDING_BASE_URL, for diff), existing LLM factory (deepseek/qwen), existing KnowledgeRetrievalService (RAG), PostgreSQL (already available), existing SSE infrastructure.
 ---
 ## 1. Data Sources
 | Source | URL | Method | Coverage |
 |--------|-----|--------|----------|
 | CATARC 汽车标准 | `https://www.catarc.org.cn/bzzxd/qcbz/index.html` | httpx + BeautifulSoup (static pages) | 国家/行业汽车标准列表 |
 | 国标委强制性标准 | `https://openstd.samr.gov.cn/bzgk/std/std_list_type?p.p1=1&p.p2=车&p.p90=circulation_date&p.p91=desc` | httpx + JSON API parse | 强制性国家标准，按"车"过滤 |
 | 国标委推荐性标准 | `https://openstd.samr.gov.cn/bzgk/std/std_list_type?p.p1=2&p.p2=车&p.p90=circulation_date&p.p91=desc` | httpx + JSON API parse | 推荐性国家标准，按"车"过滤 |
 | EUR-Lex | RSS + CELLAR REST API | pyeurlex / httpx | EU AI Act, automotive directives |
 | UN R155/R156 | CELLAR REST API (CELEX lookup) | httpx | UN-ECE cybersecurity/OTA regulations |
 Crawl is **manual-trigger only** — no cron/Celery Beat. Admin clicks "刷新数据源" in the frontend UI.
 ---
 ## 2. Database Schema
 ### New table: `regulation_events`
 ```sql
 CREATE TABLE IF NOT EXISTS regulation_events (
    id              TEXT PRIMARY KEY,          -- sha256(source + standard_code)[:12]
    source          TEXT NOT NULL,             -- 'CATARC' | '国标委' | 'EUR-Lex' | 'UN-ECE'
    source_label    TEXT,                      -- Human-readable source label
    standard_code   TEXT NOT NULL,             -- e.g. "GB 18384-2025", "EU/2024/1689"
    title           TEXT NOT NULL,
    summary         TEXT,                      -- Crawled abstract or first paragraph
    full_text_url   TEXT,                      -- Original page URL
    status          TEXT,                      -- 'enacted' | 'draft' | 'consultation'
    impact_level    TEXT,                      -- 'high' | 'medium' | 'low' (LLM-assigned)
    published_at    DATE,
    effective_at    DATE,
    category        TEXT,
    tags            TEXT[],
    -- LLM structured extraction
    obligations     JSONB,       -- [{text, deontic, subject, object, condition}]
    deadlines       JSONB,       -- [{date, description}]
    scope           TEXT,        -- Applicability scope summary
    penalties       TEXT,        -- Penalty / consequence summary
    -- Change tracking
    content_hash    TEXT,        -- SHA256 of crawled full text
    previous_hash   TEXT,        -- Hash from prior crawl (NULL on first crawl)
    change_summary  TEXT,        -- LLM-generated description of changes
    changed_sections JSONB,      -- [{old_text, new_text, change_type}] where cosine<0.85
    -- Impact assessment
    affected_docs   JSONB,       -- [{doc_id, doc_name, score, key_clauses, recommendation}]
    -- Metadata
    crawled_at      TIMESTAMPTZ DEFAULT now(),
    processed_at    TIMESTAMPTZ,
    raw_storage_key TEXT         -- MinIO path for raw HTML/PDF (optional)
 );
 CREATE INDEX IF NOT EXISTS regulation_events_source_date
    ON regulation_events (source, published_at DESC);
 CREATE INDEX IF NOT EXISTS regulation_events_impact_date
    ON regulation_events (impact_level, published_at DESC);
 CREATE INDEX IF NOT EXISTS regulation_events_tags
    ON regulation_events USING gin(tags);
 ```
 ---
 ## 3. Backend Architecture
 ### 3.1 File Map
 **New files (infrastructure layer):**
 - `backend/app/infrastructure/perception/crawlers/catarc_crawler.py` — CATARC scraper
 - `backend/app/infrastructure/perception/crawlers/guobiao_crawler.py` — 国标委 JSON API crawler
 - `backend/app/infrastructure/perception/crawlers/eurlex_crawler.py` — EUR-Lex RSS + CELLAR
 - `backend/app/infrastructure/perception/crawlers/base.py` — Abstract base class
 - `backend/app/infrastructure/perception/postgres_event_store.py` — PostgresEventStore (replaces MockEventStore)
 - `backend/app/infrastructure/perception/llm_pipeline.py` — Extract / assess / diff pipeline
 **New files (application layer):**
 - `backend/app/application/perception/crawl_service.py` — Orchestrates crawlers + LLM pipeline, exposes `run_crawl(sources)` + progress generator
 **Modified files:**
 - `backend/app/api/routes/perception.py` — Add `POST /crawl` (SSE progress stream), `POST /events/{id}/process`, `GET /events/{id}/diff`
 - `backend/app/shared/bootstrap.py` — Wire `PostgresEventStore` + `CrawlService` + `LlmPipeline` when `DOCUMENT_REPOSITORY_BACKEND=postgres`; fallback to `MockEventStore` when `json`
 - `backend/app/config/settings.py` — Add `perception_crawl_timeout_seconds`, `perception_max_events_per_source`, `perception_diff_similarity_threshold`
 **Unchanged files:**
 - `backend/app/application/perception/services.py` — `PerceptionService` interface unchanged; only `_store` swap
 - `backend/app/infrastructure/perception/mock_event_store.py` — Kept for `json` backend mode
 ### 3.2 Domain Port (Abstract Interface)
 ```python
 # backend/app/infrastructure/perception/base_event_store.py
 from abc import ABC, abstractmethod
 class BaseEventStore(ABC):
    @abstractmethod
    def all(self) -> list[dict]: ...
    @abstractmethod
    def get(self, event_id: str) -> dict | None: ...
    @abstractmethod
    def filter(self, source=None, impact_level=None, limit=50) -> list[dict]: ...
    @abstractmethod
    def stats(self) -> dict: ...
    @abstractmethod
    def upsert(self, event: dict) -> None: ...      # new — needed for crawl writes
    @abstractmethod
    def get_by_standard_code(self, code: str) -> dict | None: ...  # for change detection
 ```
 `MockEventStore` and `PostgresEventStore` both implement this interface.
 ### 3.3 Crawler Base Contract
 ```python
 # backend/app/infrastructure/perception/crawlers/base.py
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
@dataclass
 class RawEvent:
    source: str
    source_label: str
    standard_code: str
    title: str
    summary: str
    full_text_url: str
    status: str           # 'enacted' | 'draft' | 'consultation'
    published_at: str     # YYYY-MM-DD string
    effective_at: str | None
    category: str
    tags: list[str]
    raw_text: str         # full crawled text for hashing + LLM
 class BaseCrawler(ABC):
    @abstractmethod
    def fetch(self, limit: int = 50) -> list[RawEvent]: ...
 ```
 ### 3.4 LLM Pipeline
 ```python
 # backend/app/infrastructure/perception/llm_pipeline.py
 class LlmPipeline:
    """Runs three sequential LLM steps on a regulation event."""
    def extract_structure(self, event: dict) -> dict:
        """Step 1: Extract obligations, deadlines, scope, penalties, impact_level.
        Returns dict with keys: obligations, deadlines, scope, penalties, impact_level.
        Uses JSON-mode or structured prompt; model retries once on parse failure.
        """
    def assess_impact(self, event: dict, retrieval_service) -> list[dict]:
        """Step 2: RAG-based impact on existing knowledge base documents.
        Query = standard_code + title + first obligation texts.
        Returns list of {doc_id, doc_name, score, key_clauses, recommendation}.
        """
    def compute_diff(self, old_text: str, new_text: str) -> dict:
        """Step 3: Semantic diff between old and new regulation text.
        Splits both texts by paragraph. Calls existing EmbeddingService (text-embedding-v3
        via EMBEDDING_BASE_URL) to embed each paragraph, then computes cosine similarity.
        Changed paragraphs (cosine < 0.85) sent to LLM for change_type classification:
          'tightened' | 'relaxed' | 'added' | 'removed'
        Returns {changed_sections: [...], change_summary: str}.
        Only called when a previous version exists and content_hash differs from previous_hash.
        """
 ```
 ### 3.5 CrawlService
 ```python
 # backend/app/application/perception/crawl_service.py
 class CrawlService:
    def __init__(self, crawlers, event_store, llm_pipeline, retrieval_service): ...
    def run_crawl(self, sources: list[str] | None = None) -> Generator[dict, None, None]:
        """Manual-trigger crawl. Yields progress SSE dicts:
          {event: 'progress', data: {source, fetched, new, updated, stage}}
          {event: 'done', data: {total_new, total_updated, duration_ms}}
          {event: 'error', data: {source, message}}
        For each crawler:
          1. fetch() RawEvents
          2. hash check vs stored event → skip if unchanged
          3. upsert raw event to DB
          4. run LLM pipeline (extract → assess → diff)
          5. upsert enriched event to DB
          6. yield progress
        """
 ```
 ---
 ## 4. API Endpoints
 ### Existing (unchanged interface, new store backend)
 - `GET /api/v1/perception/stats`
 - `GET /api/v1/perception/events`
 - `GET /api/v1/perception/events/{id}`
 - `POST /api/v1/perception/events/{id}/analyze` (streaming)
 ### New endpoints
 ```
 POST /api/v1/perception/crawl
  Body: { sources?: ["CATARC", "国标委", "EUR-Lex", "UN-ECE"] }
  Response: text/event-stream (SSE)
  Auth: requires current_user (admin/legal role)
  Streams progress events until done or error.
 POST /api/v1/perception/events/{id}/process
  Trigger LLM pipeline for a single already-crawled event.
  Response: { status: "ok", processed_at: "..." }
  Auth: requires current_user
 GET /api/v1/perception/events/{id}/diff
  Returns: { changed_sections: [...], change_summary: str, previous_hash: str }
  Returns 404 if no diff available (first crawl or no change detected).
 ```
 ---
 ## 5. Frontend Changes
 ### 5.1 New: Crawl Control Bar (top of PerceptionPage)
 Above the stats-bar, add a `<CrawlBar>` component:
 - "刷新数据源" button — triggers `POST /crawl` (all sources)
 - Inline progress display: shows SSE progress events as a mini status line
  - e.g. "CATARC: 抓取中… | 国标委: 12 条新增 | EUR-Lex: 等待中"
 - On completion: shows "更新完成 — 新增 N 条，更新 M 条"
 - Disabled while crawl is in progress (prevents double-trigger)
 ### 5.2 Signal Card Enhancement
 Existing cards get two new indicators:
 - **NEW badge** — shown when `crawled_at` is within last 24h (green dot)
 - **CHANGED badge** — shown when `previous_hash != content_hash` and `change_summary` exists
 ### 5.3 Right Panel — Structured Tab
 Right detail panel adds a tab bar: **概览 | 义务条款 | 影响评估 | 变更对比**
 **义务条款 tab:**
 - Table: 义务描述 | 主体 | 对象 | 截止日期
 - Tags for deontic type: 强制 / 禁止 / 允许
 - Shows `obligations[]` + `deadlines[]` from DB
 **影响评估 tab:**
 - Replaces hardcoded MOCK_DOCS with real `affected_docs[]` from DB
 - Each row: document name, similarity score (%), key clause excerpt, LLM recommendation
 - "Run fresh assessment" button → triggers `POST /events/{id}/process`
 **变更对比 tab:**
 - Only visible when `change_summary` is non-null
 - Top: `change_summary` text (LLM prose)
 - Below: diff table with old/new paragraph pairs, change_type badge per row
 - Hidden (tab disabled) on first-crawl events with no prior version
 ### 5.4 Existing behavior preserved
 - `analyze` streaming (AI analysis) unchanged
 - Search/filter (source, impact) unchanged — now hits real DB data
 - Stats bar — now reflects real counts from PostgreSQL
 ---
 ## 6. Settings Additions
 ```python
 # backend/app/config/settings.py additions
 perception_crawl_timeout_seconds: int = Field(default=120, ...)
 perception_max_events_per_source: int = Field(default=100, ...)
 perception_diff_similarity_threshold: float = Field(default=0.85, ...)
 ```
 ```env
 # .env additions
 PERCEPTION_CRAWL_TIMEOUT_SECONDS=120
 PERCEPTION_MAX_EVENTS_PER_SOURCE=100
 PERCEPTION_DIFF_SIMILARITY_THRESHOLD=0.85
 ```
 ---
 ## 7. Dependencies
 ```
 # requirements.txt additions
 httpx>=0.27.0              # likely already present; confirm before adding
 beautifulsoup4>=4.12.0     # HTML parsing for CATARC
 lxml>=5.0.0                # BeautifulSoup parser backend
 # sentence-transformers NOT added — diff uses existing text-embedding-v3 API (EMBEDDING_BASE_URL)
 ```
 No new infrastructure required (PostgreSQL + MinIO + Milvus already available).
 ---
 ## 8. Backward Compatibility
 - `DOCUMENT_REPOSITORY_BACKEND=json` → `bootstrap.py` uses `MockEventStore` (unchanged behavior)
 - `DOCUMENT_REPOSITORY_BACKEND=postgres` → uses `PostgresEventStore`
 - Migration: run `CREATE TABLE` SQL on first startup (idempotent `CREATE TABLE IF NOT EXISTS`)
 - Existing 20 mock events are not seeded to PostgreSQL; PostgreSQL starts empty until first crawl
 ---
 ## 9. Out of Scope (this phase)
 - Automatic/scheduled crawling (Celery Beat) — manual trigger only
 - Playwright-based JS-rendered pages — all target sites work with httpx
 - Knowledge Graph (Neo4j / LightRAG) — future phase
 - Email/Slack webhook notifications — future phase
 - User-facing diff history (versioning beyond one prior snapshot) — future phase
--- a/frontend/src/App.tsx
+++ b/frontend/src/App.tsx
@@ -1,12 +1,14 @@
 import './styles/globals.css';
-import { ThemeProvider, AuthProvider } from './contexts';
+import { ThemeProvider, AuthProvider, PageStateProvider } from './contexts';
 import { AppRouter } from './router/AppRouter';
 function App() {
  return (
    <ThemeProvider>
      <AuthProvider>
        <PageStateProvider>
          <AppRouter />
        </PageStateProvider>
      </AuthProvider>
    </ThemeProvider>
  );
--- a/frontend/src/contexts/PageStateContext.tsx
+++ b/frontend/src/contexts/PageStateContext.tsx
@@ -0,0 +1,211 @@
 /**
 * PageStateContext — preserves page-level session state across route changes.
 *
 * When React Router unmounts a page component, all its useState values are lost.
 * This context lives above the router and holds the state that must survive
 * navigation so users can switch modules and return without losing their work.
 *
 * Covered pages:
 *  - RagChat:    message history, citation rail, sessionId, input draft
 *  - Compliance: analysis result (sources, findings, conclusion, meta)
 *  - Perception: selected signal, filter state, AI analysis output
 */
 import React, { createContext, useContext, useState, useCallback, useRef } from 'react';
 // ── RagChat types ─────────────────────────────────────────────────────────────
 /** One entry in the RagChat message history. */
 export interface RagMessage {
  id: string;
  // Who authored the message.
  role: 'user' | 'assistant';
  text: string;
  // Optional citation numbers attached to this message.
  // NOTE(review): presumably these match RagCitation.index — confirm.
  citationRefs?: number[];
 }
 /** One entry of the RagChat citation rail. */
 export interface RagCitation {
  index: number;
  // NOTE(review): scale of score (0–1 vs. 0–100) is not visible here — confirm.
  score: number;
  name: string;
  clause: string;
  snippet: string;
  docId?: string;
 }
 /** RagChat page state kept in the context: message history, citation rail, sessionId, input draft. */
 export interface RagChatState {
  messages: RagMessage[];
  citations: RagCitation[];
  // null until a session id has been assigned (see RAG_INIT).
  sessionId: string | null;
  // Text of the not-yet-sent input.
  inputDraft: string;
 }
 /** Initial RagChat state: a single assistant greeting, no citations, no session, empty draft. */
 const RAG_INIT: RagChatState = {
  messages: [
    {
      id: 'init',
      role: 'assistant',
      text: 'Hello! I can answer questions about your indexed regulations and compliance documents. Try asking about EU AI Act requirements, MIIT rules, or ISO/SAE 21434 scope.',
    },
  ],
  citations: [],
  sessionId: null,
  inputDraft: '',
 };
 // ── Compliance types ──────────────────────────────────────────────────────────
 /** One retrieved source reported during a compliance analysis (appended to ComplianceState.sources). */
 export interface ComplianceSourceEvent {
  standard: string;
  clause: string;
  score: number;
  status: string;
  full_content: string;
 }
 /** One finding reported during a compliance analysis (appended to ComplianceState.findings). */
 export interface ComplianceFindingEvent {
  title: string;
  desc: string;
  // NOTE(review): useComplianceAnalysis builds findings with `status: j.status ?? 'info'`,
  // and 'info' is not a member of this union — confirm which side should change.
  status: 'ok' | 'warn' | 'risk';
  clause_ref?: string;
 }
 /** One label/value row in ComplianceDonePayload.actions. */
 export interface ComplianceActionItem {
  label: string;
  value: string;
  // Optional flag; NOTE(review): presumably marks the row as risky for styling — confirm.
  risk?: boolean;
 }
 /** Final result of a compliance analysis, stored in ComplianceState.done. */
 export interface ComplianceDonePayload {
  conclusion: string;
  actions: ComplianceActionItem[];
  // NOTE(review): value range of risk_score is not visible here — confirm.
  risk_score: number;
  highlight_terms: string[];
  para_text: string;
 }
 /** Descriptive metadata supplied by the caller when an analysis run is started. */
 export interface ComplianceMeta {
  title: string;
  // Where the analysed content came from.
  sourceType: 'text' | 'doc' | 'upload';
  // NOTE(review): presumably an ISO timestamp string — confirm.
  startedAt: string;
 }
 /** Lifecycle phase of a compliance analysis run. */
 export type ComplianceStatus = 'idle' | 'streaming' | 'done' | 'error';
 /** Compliance page state kept in the context: analysis result (sources, findings, conclusion, meta). */
 export interface ComplianceState {
  status: ComplianceStatus;
  // Label and key of the stage currently reported by the analysis stream.
  stageLabel: string;
  stageKey: string;
  // null while no analysis has been started (see COMPLIANCE_INIT).
  meta: ComplianceMeta | null;
  sources: ComplianceSourceEvent[];
  findings: ComplianceFindingEvent[];
  // null until a final payload is available.
  done: ComplianceDonePayload | null;
  errorText: string;
 }
 /** Idle, empty compliance state; also the value resetCompliance() restores. */
 const COMPLIANCE_INIT: ComplianceState = {
  status: 'idle',
  stageLabel: '',
  stageKey: '',
  meta: null,
  sources: [],
  findings: [],
  done: null,
  errorText: '',
 };
 // ── Perception types ──────────────────────────────────────────────────────────
 /** One regulatory signal as displayed by the Perception page. */
 export interface PerceptionSignal {
  id: string;
  source: string;
  standard: string;
  status: 'ok' | 'warn' | 'risk' | 'info';
  title: string;
  summary: string;
  // NOTE(review): presumably a YYYY-MM-DD date string — confirm.
  date: string;
  tags: string[];
  impact: 'High' | 'Medium' | 'Low';
 }
 /** Perception page state kept in the context: signal list, filters, selection, AI analysis output. */
 export interface PerceptionPageState {
  signals: PerceptionSignal[];
  searchQuery: string;
  // Filter values; 'All' is the initial value for both (see PERCEPTION_INIT).
  sourceFilter: string;
  impactFilter: string;
  // Id of the selected signal, or null when nothing is selected.
  selectedId: string | null;
  aiOutput: string;
  // Active tab of the detail panel.
  detailTab: 'overview' | 'obligations' | 'assessment' | 'diff';
  crawlStatus: string;
 }
 /** Initial Perception state: no signals, no filters applied, nothing selected, overview tab. */
 const PERCEPTION_INIT: PerceptionPageState = {
  signals: [],
  searchQuery: '',
  sourceFilter: 'All',
  impactFilter: 'All',
  selectedId: null,
  aiOutput: '',
  detailTab: 'overview',
  crawlStatus: '',
 };
 // ── Context value ─────────────────────────────────────────────────────────────
 /**
  * Shape of the value exposed by PageStateContext: one state/setter pair per
  * page plus mutable refs that live in the provider rather than in the page.
  */
 interface PageStateContextValue {
  // RagChat
  ragState: RagChatState;
  setRagState: React.Dispatch<React.SetStateAction<RagChatState>>;
  ragStreamingRef: React.MutableRefObject<boolean>;
  ragAbortRef: React.MutableRefObject<AbortController | null>;
  // Compliance
  complianceState: ComplianceState;
  setComplianceState: React.Dispatch<React.SetStateAction<ComplianceState>>;
  complianceAbortRef: React.MutableRefObject<AbortController | null>;
  // Aborts the controller held in complianceAbortRef (if any) and restores COMPLIANCE_INIT.
  resetCompliance: () => void;
  // Perception
  perceptionState: PerceptionPageState;
  setPerceptionState: React.Dispatch<React.SetStateAction<PerceptionPageState>>;
  perceptionAbortRef: React.MutableRefObject<AbortController | null>;
  perceptionCrawlAbortRef: React.MutableRefObject<AbortController | null>;
 }
 // Default is null so usePageState() can detect use outside the provider.
 const PageStateContext = createContext<PageStateContextValue | null>(null);
 // ── Provider ──────────────────────────────────────────────────────────────────
 /**
  * Holds RagChat, Compliance and Perception page state above the router so it
  * survives route changes, and exposes it through PageStateContext.
  *
  * @param children Subtree that may call usePageState().
  */
 export function PageStateProvider({ children }: { children: React.ReactNode }) {
  const [ragState, setRagState] = useState<RagChatState>(RAG_INIT);
  const ragStreamingRef = useRef(false);
  const ragAbortRef = useRef<AbortController | null>(null);
  const [complianceState, setComplianceState] = useState<ComplianceState>(COMPLIANCE_INIT);
  const complianceAbortRef = useRef<AbortController | null>(null);
  // Abort any in-flight analysis before clearing its state.
  const resetCompliance = useCallback(() => {
    complianceAbortRef.current?.abort();
    setComplianceState(COMPLIANCE_INIT);
  }, []);
  const [perceptionState, setPerceptionState] = useState<PerceptionPageState>(PERCEPTION_INIT);
  const perceptionAbortRef = useRef<AbortController | null>(null);
  const perceptionCrawlAbortRef = useRef<AbortController | null>(null);
  // Memoize the context value: a fresh object literal on every render would
  // force every consumer to re-render even when no state slice changed.
  // useState setters and useRef objects have stable identity, so only the
  // state slices (and resetCompliance) need to be dependencies.
  const value = React.useMemo<PageStateContextValue>(() => ({
    ragState, setRagState, ragStreamingRef, ragAbortRef,
    complianceState, setComplianceState, complianceAbortRef, resetCompliance,
    perceptionState, setPerceptionState, perceptionAbortRef, perceptionCrawlAbortRef,
  }), [ragState, complianceState, perceptionState, resetCompliance]);
  return (
    <PageStateContext.Provider value={value}>
      {children}
    </PageStateContext.Provider>
  );
 }
 // ── Hook ──────────────────────────────────────────────────────────────────────
 /**
  * Returns the shared page state from the nearest PageStateProvider.
  *
  * @throws Error when called outside a PageStateProvider.
  */
 export function usePageState() {
  const value = useContext(PageStateContext);
  if (value === null) {
    throw new Error('usePageState must be used inside PageStateProvider');
  }
  return value;
 }
--- a/frontend/src/contexts/index.ts
+++ b/frontend/src/contexts/index.ts
@@ -1,3 +1,18 @@
 export { ThemeProvider, useTheme } from './ThemeContext';
 export { AuthProvider, useAuth } from './AuthContext';
 export type { AuthUser } from './AuthContext';
 export { PageStateProvider, usePageState } from './PageStateContext';
 export type {
  RagChatState,
  RagMessage,
  RagCitation,
  ComplianceState,
  ComplianceStatus,
  ComplianceSourceEvent,
  ComplianceFindingEvent,
  ComplianceDonePayload,
  ComplianceMeta,
  ComplianceActionItem,
  PerceptionPageState,
  PerceptionSignal,
 } from './PageStateContext';
--- a/frontend/src/pages/Compliance/useComplianceAnalysis.ts
+++ b/frontend/src/pages/Compliance/useComplianceAnalysis.ts
@@ -1,4 +1,25 @@
-import { useState, useCallback, useRef } from 'react';
+/**
 * useComplianceAnalysis — compliance analysis state wired to PageStateContext.
 *
 * State is stored in the global context so it persists when the user navigates
 * to another module and returns. The `run` and `reset` actions are identical
 * to the previous hook API so CompliancePage needs no structural changes.
 */
 import { useCallback } from 'react';
 import { usePageState } from '../../contexts';
 import type {
  ComplianceMeta,
  ComplianceState,
  ComplianceSourceEvent,
  ComplianceFindingEvent,
  ComplianceDonePayload,
 } from '../../contexts';
 export type { ComplianceMeta, ComplianceState, ComplianceSourceEvent as SourceEvent, ComplianceFindingEvent as FindingEvent, ComplianceDonePayload as DonePayload };
 export type { ComplianceActionItem as ActionItem } from '../../contexts';
 export type AnalysisStatus = import('../../contexts').ComplianceStatus;
 export type AnalysisMeta = ComplianceMeta;
 const TOKEN_KEY = 'auth_token';
 function authHeader(): Record<string, string> {
@@ -6,55 +27,7 @@ function authHeader(): Record<string, string> {
  return t ? { Authorization: `Bearer ${t}` } : {};
 }
-export type AnalysisStatus = 'idle' | 'streaming' | 'done' | 'error';
+const INITIAL_STATE: ComplianceState = {
 export interface SourceEvent {
  standard: string;
  clause: string;
  score: number;
  status: string;
  full_content: string;
 }
 export interface FindingEvent {
  title: string;
  desc: string;
  status: 'ok' | 'warn' | 'risk';
  clause_ref?: string;
 }
 export interface ActionItem {
  label: string;
  value: string;
  risk?: boolean;
 }
 export interface DonePayload {
  conclusion: string;
  actions: ActionItem[];
  risk_score: number;
  highlight_terms: string[];
  para_text: string;
 }
 export interface AnalysisMeta {
  title: string;
  sourceType: 'text' | 'doc' | 'upload';
  startedAt: string; // ISO timestamp
 }
 export interface AnalysisState {
  status: AnalysisStatus;
  stageLabel: string;
  stageKey: string;
  meta: AnalysisMeta | null;
  sources: SourceEvent[];
  findings: FindingEvent[];
  done: DonePayload | null;
  errorText: string;
 }
 const INITIAL_STATE: AnalysisState = {
  status: 'idle',
  stageLabel: '',
  stageKey: '',
@@ -66,18 +39,12 @@ const INITIAL_STATE: AnalysisState = {
 };
 export function useComplianceAnalysis() {
-  const [state, setState] = useState<AnalysisState>(INITIAL_STATE);
+  const { complianceState: state, setComplianceState: setState, complianceAbortRef, resetCompliance: reset } = usePageState();
  const abortRef = useRef<AbortController | null>(null);
-  const reset = useCallback(() => {
+  const run = useCallback(async (formData: FormData, meta: ComplianceMeta) => {
-    abortRef.current?.abort();
+    complianceAbortRef.current?.abort();
    setState(INITIAL_STATE);
  }, []);
  const run = useCallback(async (formData: FormData, meta: AnalysisMeta) => {
    abortRef.current?.abort();
    const ctrl = new AbortController();
-    abortRef.current = ctrl;
+    complianceAbortRef.current = ctrl;
    setState({ ...INITIAL_STATE, status: 'streaming', stageLabel: 'Starting…', meta });
@@ -124,7 +91,7 @@ export function useComplianceAnalysis() {
            if (j.type === 'stage') {
              setState(s => ({ ...s, stageLabel: j.label ?? '', stageKey: j.stage ?? '' }));
            } else if (j.type === 'source') {
-              const src: SourceEvent = {
+              const src: ComplianceSourceEvent = {
                standard: j.standard ?? '',
                clause: j.clause ?? '',
                score: j.score ?? 0,
@@ -133,7 +100,7 @@ export function useComplianceAnalysis() {
              };
              setState(s => ({ ...s, sources: [...s.sources, src] }));
            } else if (j.type === 'finding') {
-              const finding: FindingEvent = {
+              const finding: ComplianceFindingEvent = {
                title: j.title ?? '',
                desc: j.desc ?? '',
                status: j.status ?? 'info',
@@ -141,7 +108,7 @@ export function useComplianceAnalysis() {
              };
              setState(s => ({ ...s, findings: [...s.findings, finding] }));
            } else if (j.type === 'done') {
-              const payload: DonePayload = {
+              const payload: ComplianceDonePayload = {
                conclusion: j.conclusion ?? '',
                actions: j.actions ?? [],
                risk_score: j.risk_score ?? 0,
@@ -162,7 +129,7 @@ export function useComplianceAnalysis() {
      if (e instanceof Error && e.name === 'AbortError') return;
      setState(s => ({ ...s, status: 'error', errorText: String(e) }));
    }
-  }, []);
+  }, [setState, complianceAbortRef]);
  return { state, run, reset };
 }
--- a/frontend/src/pages/Perception/PerceptionPage.tsx
+++ b/frontend/src/pages/Perception/PerceptionPage.tsx
@@ -1,6 +1,8 @@
 import { useState, useEffect, useRef } from 'react';
 import { Topbar } from '../../components/layout/Topbar';
 import { RefreshCw, Play, Square, ExternalLink } from 'lucide-react';
 import { usePageState } from '../../contexts';
 import type { PerceptionSignal } from '../../contexts';
 const TOKEN_KEY = 'auth_token';
 function authHeader(): Record<string, string> {
@@ -8,18 +10,6 @@ function authHeader(): Record<string, string> {
  return t ? { Authorization: `Bearer ${t}` } : {};
 }
 interface Signal {
  id: string;
  source: string;
  standard: string;
  status: 'ok' | 'warn' | 'risk' | 'info';
  title: string;
  summary: string;
  date: string;
  tags: string[];
  impact: 'High' | 'Medium' | 'Low';
 }
 interface Stats {
  total: number;
  high_impact: number;
@@ -27,29 +17,17 @@ interface Stats {
  last_90_days: number;
 }
 interface DocResult {
  score: number;
  name: string;
  clause: string;
  snippet: string;
 }
 const SOURCES = ['All', 'MIIT', 'UN-ECE', 'ISO', 'GB Comm.', 'EUR-Lex', 'IATF'];
 const IMPACTS = ['All', 'High', 'Medium', 'Low'];
-// Backend /api/v1/perception/stats returns:
+// Backend event → Signal
-// { total, high_impact, medium_impact, last_90_days } — field names match, ✓
+function mapEvent(e: Record<string, unknown>): PerceptionSignal {
 // Backend /api/v1/perception/events returns:
 // { events: [{ id, title, summary, source, standard, impact_level, published_at, tags, status }] }
 // Map backend event fields → frontend Signal shape
 function mapEvent(e: Record<string, unknown>): Signal {
  const impact = String(e.impact_level ?? '').toLowerCase();
  const backendStatus = String(e.status ?? '').toLowerCase();
  return {
    id: String(e.id ?? e.event_id ?? ''),
    source: String(e.source ?? ''),
-    standard: String(e.standard ?? e.regulation_id ?? ''),
+    standard: String(e.standard ?? e.standard_code ?? e.regulation_id ?? ''),
    status: backendStatus === 'high' || backendStatus === 'urgent' ? 'risk'
          : backendStatus === 'medium' || backendStatus === 'draft' ? 'warn'
          : backendStatus === 'low' || backendStatus === 'final' ? 'ok'
@@ -62,50 +40,40 @@ function mapEvent(e: Record<string, unknown>): Signal {
  };
 }
-const MOCK_SIGNALS: Signal[] = [
+const MOCK_SIGNALS: PerceptionSignal[] = [
  {
    id: '1', source: 'EUR-Lex', standard: 'EU/2024/1689', status: 'risk',
    title: 'EU AI Act — High-risk AI in vehicles',
    summary: 'Article 9 mandates risk management systems for automotive AI classifying as high-risk under Annex III point 3.',
-    date: '2025-11-18', tags: ['automotive', 'GDPR', 'certification'], impact: 'High'
+    date: '2025-11-18', tags: ['automotive', 'GDPR', 'certification'], impact: 'High',
  },
  {
    id: '2', source: 'MIIT', standard: 'Draft-2025-08', status: 'warn',
    title: 'MIIT Draft — in-vehicle AI training data',
    summary: 'Draft regulation requires OEM data provenance documentation and OTA audit trails for AI systems.',
-    date: '2025-10-30', tags: ['OTA', 'data-governance', 'China'], impact: 'High'
+    date: '2025-10-30', tags: ['OTA', 'data-governance', 'China'], impact: 'High',
  },
  {
    id: '3', source: 'ISO', standard: 'ISO/SAE 21434:2021/Amd1', status: 'info',
    title: 'ISO/SAE 21434 Amendment 1',
    summary: 'Amendment clarifies CSMS scope for software-only updates and vulnerability disclosure timelines.',
-    date: '2025-10-05', tags: ['cybersecurity', 'CSMS', 'ISO'], impact: 'Medium'
+    date: '2025-10-05', tags: ['cybersecurity', 'CSMS', 'ISO'], impact: 'Medium',
  },
  {
    id: '4', source: 'UN-ECE', standard: 'UNECE WP.29 R155', status: 'ok',
    title: 'UNECE R155 Corrigendum',
    summary: 'Editorial corrections to cybersecurity management system requirements. No substantive changes.',
    date: '2025-09-12', tags: ['type-approval', 'UNECE'], impact: 'Low'
  },
 ];
 const MOCK_DOCS: DocResult[] = [
  { score: 94, name: 'Vehicle AI Safety Manual v3.2', clause: '§4.2.1', snippet: 'The risk management process shall identify and evaluate risks arising from AI system decisions in safety-critical scenarios...' },
  { score: 87, name: 'ADAS System Requirements', clause: '§7.1', snippet: 'Automated driving functions must document training data lineage and model performance envelopes prior to deployment.' },
  { score: 71, name: 'Type Approval Documentation', clause: 'Annex B', snippet: 'Cybersecurity management system certification requires third-party audit of AI decision audit logs retention policy.' },
 ];
 export function PerceptionPage() {
-  const [stats, setStats] = useState<Stats | null>(null);
+  // Persistent state lives in PageStateContext — survives route changes
-  const [signals, setSignals] = useState<Signal[]>(MOCK_SIGNALS);
+  const { perceptionState, setPerceptionState, perceptionAbortRef, perceptionCrawlAbortRef } = usePageState();
-  const [searchQuery, setSearchQuery] = useState('');
+  const { signals, searchQuery, sourceFilter, impactFilter, selectedId, aiOutput, detailTab, crawlStatus } = perceptionState;
  const [sourceFilter, setSourceFilter] = useState('All');
  const [impactFilter, setImpactFilter] = useState('All');
  const [selected, setSelected] = useState<Signal | null>(null);
  const [streaming, setStreaming] = useState(false);
  const [aiOutput, setAiOutput] = useState('');
  const abortRef = useRef<AbortController | null>(null);
  // Stats and selectedFull are lightweight to re-fetch on mount
  const [stats, setStats] = useState<Stats | null>(null);
  const [streaming, setStreaming] = useState(false);
  const [crawling, setCrawling] = useState(false);
  // Full event detail — re-fetched when selected changes or page mounts with a selection
  const [selectedFull, setSelectedFull] = useState<Record<string, unknown> | null>(null);
  // Re-fetch stats every time the page mounts
  useEffect(() => {
    fetch('/api/v1/perception/stats', { headers: authHeader() })
      .then(r => r.json())
@@ -113,16 +81,36 @@ export function PerceptionPage() {
      .catch(() => setStats({ total: 47, high_impact: 7, medium_impact: 18, last_90_days: 14 }));
  }, []);
  // Fetch signal list on first mount only (if empty), otherwise preserve context state
  useEffect(() => {
    if (signals.length > 0) return; // already loaded
    fetch('/api/v1/perception/events?limit=100', { headers: authHeader() })
      .then(r => r.json())
      .then(d => {
        if (Array.isArray(d?.events) && d.events.length > 0) {
-          setSignals(d.events.map(mapEvent));
+          setPerceptionState(s => ({ ...s, signals: d.events.map(mapEvent) }));
        } else {
          setPerceptionState(s => ({ ...s, signals: MOCK_SIGNALS }));
        }
      })
-      .catch(() => { /* keep mock data on error */ });
+      .catch(() => {
-  }, []);
+        setPerceptionState(s => ({ ...s, signals: s.signals.length > 0 ? s.signals : MOCK_SIGNALS }));
      });
  }, []); // eslint-disable-line react-hooks/exhaustive-deps
  // Re-fetch full event detail when navigating back with a selected signal
  useEffect(() => {
    if (selectedId) {
      fetch(`/api/v1/perception/events/${selectedId}`, { headers: authHeader() })
        .then(r => r.ok ? r.json() : null)
        .then(d => { if (d) setSelectedFull(d); })
        .catch(() => {});
    } else {
      setSelectedFull(null);
    }
  }, [selectedId]);
  const selected = signals.find(s => s.id === selectedId) ?? null;
  const filtered = signals.filter(s => {
    if (sourceFilter !== 'All' && s.source !== sourceFilter) return false;
@@ -137,13 +125,20 @@ export function PerceptionPage() {
  function runAnalysis() {
    if (!selected) return;
    setStreaming(true);
-    setAiOutput('');
+    setPerceptionState(s => ({ ...s, aiOutput: '' }));
    const ctrl = new AbortController();
-    abortRef.current = ctrl;
+    perceptionAbortRef.current = ctrl;
-    // Backend: POST /api/v1/perception/events/{id}/analyze  → SSE stream
+    fetch(`/api/v1/perception/events/${selected.id}/analyze`, {
-    fetch(`/api/v1/perception/events/${selected.id}/analyze`, { method: 'POST', headers: authHeader(), signal: ctrl.signal })
+      method: 'POST',
      headers: authHeader(),
      signal: ctrl.signal,
    })
      .then(async res => {
-        if (!res.body) { setAiOutput('No stream available.'); setStreaming(false); return; }
+        if (!res.body) {
          setPerceptionState(s => ({ ...s, aiOutput: 'No stream available.' }));
          setStreaming(false);
          return;
        }
        const reader = res.body.getReader();
        const dec = new TextDecoder();
        let buf = '';
@@ -160,30 +155,99 @@ export function PerceptionPage() {
            if (!raw || raw === '[DONE]') continue;
            try {
              const j = JSON.parse(raw);
-              if (j.text) setAiOutput(p => p + j.text);
+              if (j.text) setPerceptionState(s => ({ ...s, aiOutput: s.aiOutput + j.text }));
-              else if (typeof j === 'string') setAiOutput(p => p + j);
+              else if (typeof j === 'string') setPerceptionState(s => ({ ...s, aiOutput: s.aiOutput + j }));
            } catch {
-              setAiOutput(p => p + raw);
+              setPerceptionState(s => ({ ...s, aiOutput: s.aiOutput + raw }));
            }
          }
        }
        setStreaming(false);
      })
      .catch(e => {
-        if (e.name !== 'AbortError') setAiOutput('Analysis failed. Check API connection.');
+        if (e.name !== 'AbortError') setPerceptionState(s => ({ ...s, aiOutput: 'Analysis failed. Check API connection.' }));
        setStreaming(false);
      });
  }
  function stopAnalysis() {
-    abortRef.current?.abort();
+    perceptionAbortRef.current?.abort();
    setStreaming(false);
  }
-  function selectSignal(sig: Signal) {
+  async function runCrawl() {
-    setSelected(sig);
+    setCrawling(true);
-    setAiOutput('');
+    setPerceptionState(s => ({ ...s, crawlStatus: '正在连接数据源...' }));
    try {
      const res = await fetch('/api/v1/perception/crawl', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json', ...authHeader() },
        body: JSON.stringify({}),
      });
      if (!res.body) {
        setPerceptionState(s => ({ ...s, crawlStatus: 'No stream' }));
        setCrawling(false);
        return;
      }
      const reader = res.body.getReader();
      const dec = new TextDecoder();
      let buf = '';
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        buf += dec.decode(value);
        const parts = buf.split('\n\n');
        buf = parts.pop() ?? '';
        for (const block of parts) {
          const eventLine = block.split('\n').find(l => l.startsWith('event: '));
          const dataLine = block.split('\n').find(l => l.startsWith('data: '));
          const evtName = eventLine?.slice(7).trim();
          const raw = dataLine?.slice(6).trim();
          if (!raw) continue;
          try {
            const d = JSON.parse(raw);
            if (evtName === 'progress') {
              setPerceptionState(s => ({
                ...s,
                crawlStatus: `${d.source}: ${d.stage === 'fetching' ? '抓取中...' : d.stage === 'processing' ? `处理 ${d.fetched} 条...` : `完成 +${d.new} 条`}`,
              }));
            } else if (evtName === 'done') {
              setPerceptionState(s => ({ ...s, crawlStatus: `更新完成 — 新增 ${d.total_new} 条，更新 ${d.total_updated} 条` }));
              fetch('/api/v1/perception/events?limit=100', { headers: authHeader() })
                .then(r => r.json())
                .then(d2 => {
                  if (Array.isArray(d2?.events)) {
                    setPerceptionState(s => ({ ...s, signals: d2.events.map(mapEvent) }));
                  }
                });
            } else if (evtName === 'error') {
              setPerceptionState(s => ({
                ...s,
                crawlStatus: `错误: ${typeof d === 'string' ? d : d.message}`,
              }));
            }
          } catch { /* ignore */ }
        }
      }
    } catch (e: unknown) {
      setPerceptionState(s => ({
        ...s,
        crawlStatus: `连接失败: ${e instanceof Error ? e.message : String(e)}`,
      }));
    }
    setCrawling(false);
  }
  /**
   * Make `sig` the selected signal and reset the detail view for it.
   *
   * Stores the signal's id in the shared perception state, clears any previous
   * AI output, and switches the detail tab back to 'overview'. It also clears
   * the locally held full event detail, turns off the streaming flag, and
   * aborts whatever controller is currently held in `perceptionAbortRef`.
   *
   * @param sig - The signal whose list card was clicked.
   */
  function selectSignal(sig: PerceptionSignal) {
    // Selection, AI output and active tab live in the shared page state.
    setPerceptionState(s => ({
      ...s,
      selectedId: sig.id,
      aiOutput: '',
      detailTab: 'overview',
    }));
    // Drop the previous signal's full detail so it is not shown for the new selection.
    setSelectedFull(null);
    setStreaming(false);
    // runAnalysis stores its AbortController in this ref; cancel it if one is present.
    perceptionAbortRef.current?.abort();
  }
  return (
@@ -197,10 +261,18 @@ export function PerceptionPage() {
              <input
                placeholder="Search signals..."
                value={searchQuery}
-                onChange={e => setSearchQuery(e.target.value)}
+                onChange={e => setPerceptionState(s => ({ ...s, searchQuery: e.target.value }))}
              />
            </div>
-            <button className="btn sm"><RefreshCw size={13} />Refresh</button>
+            <button className="btn sm primary" onClick={runCrawl} disabled={crawling}>
              <RefreshCw size={13} className={crawling ? 'spin' : ''} />
              {crawling ? '抓取中...' : '刷新数据源'}
            </button>
            {crawlStatus && (
              <span style={{ fontSize: 12, color: 'var(--text-secondary)', marginLeft: 8 }}>
                {crawlStatus}
              </span>
            )}
          </>
        }
      />
@@ -227,13 +299,25 @@ export function PerceptionPage() {
      <div className="filter-bar">
        <div className="chip-group">
          {SOURCES.map(s => (
-            <button key={s} className={`chip${sourceFilter === s ? ' active' : ''}`} onClick={() => setSourceFilter(s)}>{s}</button>
+            <button
              key={s}
              className={`chip${sourceFilter === s ? ' active' : ''}`}
              onClick={() => setPerceptionState(st => ({ ...st, sourceFilter: s }))}
            >
              {s}
            </button>
          ))}
        </div>
        <div className="filter-sep" />
        <div className="chip-group">
          {IMPACTS.map(i => (
-            <button key={i} className={`chip${impactFilter === i ? ' active' : ''}`} onClick={() => setImpactFilter(i)}>{i}</button>
+            <button
              key={i}
              className={`chip${impactFilter === i ? ' active' : ''}`}
              onClick={() => setPerceptionState(st => ({ ...st, impactFilter: i }))}
            >
              {i}
            </button>
          ))}
        </div>
      </div>
@@ -243,7 +327,7 @@ export function PerceptionPage() {
          {filtered.map(sig => (
            <div
              key={sig.id}
-              className={`ev-card${selected?.id === sig.id ? ' selected' : ''}`}
+              className={`ev-card${selectedId === sig.id ? ' selected' : ''}`}
              onClick={() => selectSignal(sig)}
            >
              <div className="ev-top">
@@ -277,8 +361,11 @@ export function PerceptionPage() {
                  <span className="source-tag">{selected.source}</span>
                  <span className="ev-std">{selected.standard}</span>
                  <span className={`status ${selected.status}`}>
-                    {selected.status === 'risk' ? 'Urgent' : 'Published'}
+                    {selected.status === 'risk' ? 'Urgent' : selected.status === 'warn' ? 'Draft' : 'Published'}
                  </span>
                  {selectedFull?.change_summary && (
                    <span className="status warn" style={{ marginLeft: 'auto' }}>CHANGED</span>
                  )}
                </div>
                <div className="detail-title">{selected.title}</div>
                <p className="detail-summary">{selected.summary}</p>
@@ -287,22 +374,159 @@ export function PerceptionPage() {
                    ? <button className="btn sm primary" onClick={runAnalysis}><Play size={12} />Run impact analysis</button>
                    : <button className="btn sm" onClick={stopAnalysis}><Square size={12} />Stop</button>
                  }
-                  <button className="btn sm"><ExternalLink size={12} />Source</button>
+                  {selected && (
                    <a
                      href={(selectedFull?.full_text_url as string) || '#'}
                      target="_blank"
                      rel="noopener noreferrer"
                      className="btn sm"
                    >
                      <ExternalLink size={12} />Source
                    </a>
                  )}
                </div>
              </div>
-              <div className="card docs-card">
+              <div className="detail-tabs">
-                <div className="card-header">Affected documents</div>
+                {(['overview', 'obligations', 'assessment', 'diff'] as const).map(tab => (
-                {MOCK_DOCS.map(d => (
+                  <button
-                  <div key={d.name} className="doc-row">
+                    key={tab}
-                    <span className="doc-score">{d.score}%</span>
+                    className={`detail-tab${detailTab === tab ? ' active' : ''}${tab === 'diff' && !selectedFull?.change_summary ? ' disabled' : ''}`}
-                    <div>
+                    onClick={() => {
-                      <div className="doc-name">{d.name} <span className="doc-clause">{d.clause}</span></div>
+                      if (tab !== 'diff' || selectedFull?.change_summary) {
-                      <div className="doc-snippet">{d.snippet}</div>
+                        setPerceptionState(s => ({ ...s, detailTab: tab }));
                      }
                    }}
                  >
                    {tab === 'overview' ? '概览' : tab === 'obligations' ? '义务条款' : tab === 'assessment' ? '影响评估' : '变更对比'}
                  </button>
                ))}
              </div>
              {detailTab === 'overview' && (
                <div className="card">
                  <div className="card-header">Scope &amp; Summary</div>
                  <p className="detail-summary" style={{ marginTop: 8 }}>
                    {(selectedFull?.scope as string) || selected.summary}
                  </p>
                  {selectedFull?.penalties && (
                    <p style={{ fontSize: 13, color: 'var(--danger)', marginTop: 6 }}>
                      ⚠ {selectedFull.penalties as string}
                    </p>
                  )}
                </div>
              )}
              {detailTab === 'obligations' && (
                <div className="card">
                  <div className="card-header">义务条款</div>
                  {(() => {
                    const obs = (selectedFull?.obligations as Array<Record<string, string>>) || [];
                    const deadlines = (selectedFull?.deadlines as Array<Record<string, string>>) || [];
                    return obs.length === 0 && deadlines.length === 0 ? (
                      <p className="detail-summary" style={{ marginTop: 8 }}>暂无结构化数据。点击右上角"Run impact analysis"触发提取。</p>
                    ) : (
                      <>
                        {obs.length > 0 && (
                          <table style={{ width: '100%', fontSize: 13, borderCollapse: 'collapse', marginTop: 8 }}>
                            <thead>
                              <tr style={{ borderBottom: '1px solid var(--border)' }}>
                                <th style={{ textAlign: 'left', padding: '4px 8px' }}>义务描述</th>
                                <th style={{ textAlign: 'left', padding: '4px 8px', width: 80 }}>主体</th>
                                <th style={{ textAlign: 'left', padding: '4px 8px', width: 60 }}>类型</th>
                              </tr>
                            </thead>
                            <tbody>
                              {obs.map((ob, i) => (
                                <tr key={i} style={{ borderBottom: '1px solid var(--border-faint)' }}>
                                  <td style={{ padding: '6px 8px' }}>{ob.text}</td>
                                  <td style={{ padding: '6px 8px', color: 'var(--text-secondary)' }}>{ob.subject}</td>
                                  <td style={{ padding: '6px 8px' }}>
                                    <span className={`status ${ob.deontic === 'must' || ob.deontic === 'shall' ? 'risk' : ob.deontic === 'prohibited' ? 'risk' : 'info'}`}>
                                      {ob.deontic}
                                    </span>
                                  </td>
                                </tr>
                              ))}
                            </tbody>
                          </table>
                        )}
                        {deadlines.length > 0 && (
                          <div style={{ marginTop: 12 }}>
                            <div className="card-header">截止日期</div>
                            {deadlines.map((d, i) => (
                              <div key={i} style={{ fontSize: 13, padding: '4px 0', display: 'flex', gap: 12 }}>
                                <span style={{ fontWeight: 600, color: 'var(--danger)' }}>{d.date || '待定'}</span>
                                <span style={{ color: 'var(--text-secondary)' }}>{d.description}</span>
                              </div>
                            ))}
                          </div>
                        )}
                      </>
                    );
                  })()}
                </div>
              )}
              {detailTab === 'assessment' && (
                <div className="card docs-card">
                  <div className="card-header">Affected documents</div>
                  {(() => {
                    const docs = (selectedFull?.affected_docs as Array<Record<string, unknown>>);
                    const displayDocs = docs && docs.length > 0 ? docs : [];
                    return displayDocs.length === 0
                      ? <p className="detail-summary" style={{ marginTop: 8 }}>No affected documents found.</p>
                      : displayDocs.map((d, i) => (
                          <div key={i} className="doc-row">
                            <span className="doc-score">{Math.round(Number(d.score ?? 0) * 100)}%</span>
                            <div>
                              <div className="doc-name">
                                {String(d.doc_name || '')}
                                <span className="doc-clause">{String(d.key_clauses || d.clause || '')}</span>
                              </div>
                              {d.snippet && <div className="doc-snippet">{String(d.snippet)}</div>}
                              {d.recommendation && (
                                <div style={{ fontSize: 12, color: 'var(--accent)', marginTop: 2 }}>→ {String(d.recommendation)}</div>
                              )}
                            </div>
                          </div>
                        ));
                  })()}
                </div>
              )}
              {detailTab === 'diff' && selectedFull?.change_summary && (
                <div className="card">
                  <div className="card-header">变更对比</div>
                  <p style={{ fontSize: 13, color: 'var(--text-secondary)', marginTop: 8 }}>
                    {selectedFull.change_summary as string}
                  </p>
                  {(() => {
                    const sections = (selectedFull.changed_sections as Array<Record<string, unknown>>) || [];
                    return sections.map((s, i) => (
                      <div key={i} style={{ marginTop: 12, borderTop: '1px solid var(--border)', paddingTop: 10 }}>
                        <div style={{ display: 'flex', gap: 8, marginBottom: 6 }}>
                          <span className={`status ${s.change_type === 'tightened' || s.change_type === 'added' ? 'risk' : s.change_type === 'removed' ? 'warn' : 'info'}`}>
                            {String(s.change_type)}
                          </span>
                          <span style={{ fontSize: 12, color: 'var(--text-secondary)' }}>cosine: {String(s.similarity)}</span>
                        </div>
                        <div style={{ display: 'grid', gridTemplateColumns: '1fr 1fr', gap: 8, fontSize: 12 }}>
                          <div style={{ background: 'var(--danger-bg)', padding: 8, borderRadius: 4 }}>
                            <div style={{ fontWeight: 600, marginBottom: 4 }}>旧版</div>
                            {String(s.old_text || '')}
                          </div>
                          <div style={{ background: 'var(--success-bg)', padding: 8, borderRadius: 4 }}>
                            <div style={{ fontWeight: 600, marginBottom: 4 }}>新版</div>
                            {String(s.new_text || '')}
                          </div>
                        </div>
                        {s.summary && <p style={{ fontSize: 12, marginTop: 6, color: 'var(--text-secondary)' }}>{String(s.summary)}</p>}
                      </div>
                    ));
                  })()}
                </div>
              )}
              {(aiOutput || streaming) && (
                <div className="card ai-card">
--- a/frontend/src/pages/RagChat/RagChatPage.tsx
+++ b/frontend/src/pages/RagChat/RagChatPage.tsx
@@ -1,6 +1,8 @@
-import { useState, useRef, useEffect, useCallback } from 'react';
+import { useRef, useEffect, useCallback, useState } from 'react';
 import { Topbar } from '../../components/layout/Topbar';
 import { Send, Download } from 'lucide-react';
 import { usePageState } from '../../contexts';
 import type { RagCitation } from '../../contexts';
 const TOKEN_KEY = 'auth_token';
 function authHeader(): Record<string, string> {
@@ -8,26 +10,8 @@ function authHeader(): Record<string, string> {
  return t ? { Authorization: `Bearer ${t}` } : {};
 }
 interface Message {
  id: string;
  role: 'user' | 'assistant';
  text: string;
  // citation indices mentioned in this assistant message (1-based, matching citations array)
  citationRefs?: number[];
 }
 interface Citation {
  index: number;   // 1-based, matches [N] markers in text
  score: number;   // 0–100 display percentage
  name: string;    // doc_name
  clause: string;  // section_title or clause
  snippet: string; // preview text
  docId?: string;
 }
 // Map a raw source doc from the backend "retrieved" event to our Citation shape.
-// Backend fields: { id, score(0-1), preview, doc_name, clause, doc_id }
+function mapSource(s: Record<string, unknown>, idx: number): RagCitation {
 function mapSource(s: Record<string, unknown>, idx: number): Citation {
  const rawScore = typeof s.score === 'number' ? s.score : 0;
  const displayScore = rawScore <= 1 ? Math.round(rawScore * 100) : Math.round(rawScore);
  return {
@@ -73,25 +57,21 @@ const MOCK_QUICK = [
 ];
 export function RagChatPage() {
-  const [messages, setMessages] = useState<Message[]>([
+  // All persistent state lives in PageStateContext — survives route changes
-    {
+  const { ragState, setRagState, ragStreamingRef, ragAbortRef } = usePageState();
-      id: 'init', role: 'assistant',
+  const { messages, citations, sessionId, inputDraft } = ragState;
-      text: 'Hello! I can answer questions about your indexed regulations and compliance documents. Try asking about EU AI Act requirements, MIIT rules, or ISO/SAE 21434 scope.',
+
-    }
+  // Local-only UI state: highlighted citation and streaming indicator
-  ]);
+  // These are fine to reset on navigation since they're transient UI feedback
  const [quickPrompts, setQuickPrompts] = useState<string[]>(MOCK_QUICK);
  const [input, setInput] = useState('');
  const [streaming, setStreaming] = useState(false);
  const [citations, setCitations] = useState<Citation[]>([]);
  const [highlightedCit, setHighlightedCit] = useState<number | null>(null);
-  const [sessionId, setSessionId] = useState<string | null>(null);
+  const [streaming, setStreaming] = useState(ragStreamingRef.current);
  const [quickPrompts, setQuickPrompts] = useState<string[]>(MOCK_QUICK);
  const bottomRef = useRef<HTMLDivElement>(null);
  const citRailRef = useRef<HTMLDivElement>(null);
  const citItemRefs = useRef<Record<number, HTMLDivElement | null>>({});
  const abortRef = useRef<AbortController | null>(null);
-  // Fetch quick questions from backend on mount
+  // Fetch quick questions from backend on mount (only once per session)
  useEffect(() => {
    fetch('/api/v1/rag/quick-questions', { headers: authHeader() })
      .then(r => r.json())
@@ -115,26 +95,33 @@ export function RagChatPage() {
    if (el) {
      el.scrollIntoView({ behavior: 'smooth', block: 'nearest' });
    }
    // Clear highlight after 3s
    setTimeout(() => setHighlightedCit(h => h === n ? null : h), 3000);
  }, []);
  async function send(text?: string) {
-    const q = (text ?? input).trim();
+    const q = (text ?? inputDraft).trim();
-    if (!q || streaming) return;
+    if (!q || ragStreamingRef.current) return;
-    setInput('');
+    setRagState(s => ({ ...s, inputDraft: '' }));
    const userMsg: Message = { id: Date.now().toString(), role: 'user', text: q };
    setMessages(m => [...m, userMsg]);
    const userMsgId = Date.now().toString();
    const assistantId = (Date.now() + 1).toString();
-    setMessages(m => [...m, { id: assistantId, role: 'assistant', text: '' }]);
+
    setRagState(s => ({
      ...s,
      messages: [
        ...s.messages,
        { id: userMsgId, role: 'user', text: q },
        { id: assistantId, role: 'assistant', text: '' },
      ],
      citations: [],
    }));
    ragStreamingRef.current = true;
    setStreaming(true);
    setCitations([]);
    setHighlightedCit(null);
    const ctrl = new AbortController();
-    abortRef.current = ctrl;
+    ragAbortRef.current = ctrl;
    try {
      const body: Record<string, unknown> = { query: q, top_k: 5 };
@@ -151,14 +138,13 @@ export function RagChatPage() {
      const reader = res.body.getReader();
      const dec = new TextDecoder();
      let buffer = '';
-      const newCitations: Citation[] = [];
+      const newCitations: RagCitation[] = [];
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        buffer += dec.decode(value, { stream: true });
        // SSE blocks separated by double newline
        const blocks = buffer.split('\n\n');
        buffer = blocks.pop() ?? '';
@@ -171,56 +157,62 @@ export function RagChatPage() {
            const j = JSON.parse(raw);
            if (j.type === 'session') {
-              // Backend assigned a session_id — persist for next request
+              if (j.session_id) setRagState(s => ({ ...s, sessionId: j.session_id }));
              if (j.session_id) setSessionId(j.session_id);
            } else if (j.type === 'retrieved' && Array.isArray(j.docs)) {
              // Sources arrive before the answer starts
              const mapped = j.docs.map((d: Record<string, unknown>, i: number) => mapSource(d, i + 1));
              newCitations.push(...mapped);
-              setCitations([...mapped]);
+              setRagState(s => ({ ...s, citations: [...mapped] }));
            } else if (j.type === 'chunk' && j.text) {
-              setMessages(m => m.map(msg =>
+              setRagState(s => ({
                ...s,
                messages: s.messages.map(msg =>
                  msg.id === assistantId
                    ? { ...msg, text: msg.text + (j.text as string) }
                    : msg
-              ));
+                ),
-
+              }));
            } else if (j.type === 'status') {
              // Status message (e.g. "找到N条相关法规…") — could show in UI if desired
              // For now we ignore it to keep the bubble clean
            } else if (j.type === 'done') {
-              // Extract which citation numbers appear in the final answer
+              setRagState(s => ({
-              setMessages(m => m.map(msg => {
+                ...s,
                messages: s.messages.map(msg => {
                  if (msg.id !== assistantId) return msg;
                  const refs = [...new Set(
                    [...msg.text.matchAll(/\[(\d+)\]/g)].map(r => parseInt(r[1], 10))
                  )].filter(n => n >= 1 && n <= newCitations.length);
                  return { ...msg, citationRefs: refs };
                }),
              }));
              break;
            } else if (j.type === 'error') {
-              setMessages(m => m.map(msg =>
+              setRagState(s => ({
                ...s,
                messages: s.messages.map(msg =>
                  msg.id === assistantId
                    ? { ...msg, text: `Error: ${j.text ?? 'Unknown error'}` }
                    : msg
-              ));
+                ),
              }));
            }
          } catch { /* malformed JSON chunk, skip */ }
        }
      }
    } catch (e: unknown) {
      if (e instanceof Error && e.name !== 'AbortError') {
-        setMessages(m => m.map(msg =>
+        setRagState(s => ({
          ...s,
          messages: s.messages.map(msg =>
            msg.id === assistantId
              ? { ...msg, text: 'Could not reach the RAG API. Please check the backend.' }
              : msg
-        ));
+          ),
        }));
      }
    } finally {
      ragStreamingRef.current = false;
      setStreaming(false);
    }
  }
@@ -291,15 +283,15 @@ export function RagChatPage() {
              <textarea
                className="composer-input"
                placeholder="Ask about your regulations…"
-                value={input}
+                value={inputDraft}
-                onChange={e => setInput(e.target.value)}
+                onChange={e => setRagState(s => ({ ...s, inputDraft: e.target.value }))}
                onKeyDown={e => { if (e.key === 'Enter' && !e.shiftKey) { e.preventDefault(); send(); } }}
                rows={2}
              />
              <button
                className="btn primary"
                onClick={() => send()}
-                disabled={!input.trim() || streaming}
+                disabled={!inputDraft.trim() || streaming}
              >
                <Send size={14} />
              </button>
--- a/frontend/src/styles/globals.css
+++ b/frontend/src/styles/globals.css
@@ -1108,3 +1108,33 @@ mark.comp-highlight {
  transition: color 0.15s;
 }
 .logout-btn:hover { color: var(--danger); }
 /* ── Detail Tabs (Perception) ──────────────────── */
 /* Container for the detail-view tab buttons: a flex row with a 2px gap,
    8px top margin, and a 1px bottom border. */
 .detail-tabs {
  display: flex;
  gap: 2px;
  margin: 8px 0 0;
  border-bottom: 1px solid var(--border);
  padding-bottom: 0;
 }
 /* Base style for a single tab button: no background or outer border, with a
    transparent 2px bottom border that the .active rule recolors. */
 .detail-tab {
  background: none;
  border: none;
  border-bottom: 2px solid transparent;
  padding: 6px 14px;
  font-size: 13px;
  color: var(--text-secondary);
  cursor: pointer;
  transition: color 0.15s, border-color 0.15s;
 }
 /* Hover: switch the label to the primary text color. */
 .detail-tab:hover { color: var(--text); }
 /* Active tab: accent-colored label and bottom border, bolder weight. */
 .detail-tab.active {
  color: var(--accent);
  border-bottom-color: var(--accent);
  font-weight: 600;
 }
 /* Disabled tab: visual treatment only (dimmed, not-allowed cursor).
    This rule does not itself block clicks. */
 .detail-tab.disabled {
  opacity: 0.35;
  cursor: not-allowed;
 }
 /* Utility class: runs the `spin` animation on a 1s linear infinite loop.
    NOTE(review): the `@keyframes spin` definition is not visible in this hunk — confirm it exists. */
 .spin { animation: spin 1s linear infinite; }
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,6 +24,8 @@ dependencies = [
    "loguru>=0.7.0",
    "tenacity>=8.2.0",
    "httpx>=0.24.0",
    "beautifulsoup4>=4.12.0",
    "lxml>=5.0.0",
    "alibabacloud-docmind-api20220711>=1.0.6",
    "alibabacloud-tea-openapi>=0.3.11",
    "alibabacloud-tea-util>=0.3.13",