diff --git a/.env b/.env
index 7cef945..a92f5b7 100644
--- a/.env
+++ b/.env
@@ -54,6 +54,11 @@ DOCUMENT_REPOSITORY_BACKEND=json
# Default false: processing runs in FastAPI's threadpool — no external worker needed.
USE_CELERY_WORKER=false
+# ===== 法规感知爬取配置 =====
+PERCEPTION_CRAWL_TIMEOUT_SECONDS=120
+PERCEPTION_MAX_EVENTS_PER_SOURCE=100
+PERCEPTION_DIFF_SIMILARITY_THRESHOLD=0.85
+
# ===== API配置 =====
API_HOST=0.0.0.0
API_PORT=8000
diff --git a/.env.example b/.env.example
index 26131db..13a7539 100644
--- a/.env.example
+++ b/.env.example
@@ -55,6 +55,11 @@ DOCUMENT_REPOSITORY_BACKEND=json
# Default false: document processing runs in FastAPI's threadpool (no external worker needed).
USE_CELERY_WORKER=false
+# ===== 法规感知爬取配置 =====
+PERCEPTION_CRAWL_TIMEOUT_SECONDS=120
+PERCEPTION_MAX_EVENTS_PER_SOURCE=100
+PERCEPTION_DIFF_SIMILARITY_THRESHOLD=0.85
+
# ===== 阿里云文档解析 =====
ALIBABA_ACCESS_KEY_ID=your_aliyun_access_key_id
ALIBABA_ACCESS_KEY_SECRET=your_aliyun_access_key_secret
diff --git a/backend/app/api/routes/perception.py b/backend/app/api/routes/perception.py
index 7470234..e398e49 100644
--- a/backend/app/api/routes/perception.py
+++ b/backend/app/api/routes/perception.py
@@ -4,10 +4,12 @@ from __future__ import annotations
import json
-from fastapi import APIRouter, Query
+from fastapi import APIRouter, Depends, Query
from fastapi.responses import StreamingResponse
-from app.shared.bootstrap import get_perception_service
+from app.shared.bootstrap import get_crawl_service, get_event_store, get_perception_service
+from app.api.dependencies.auth import get_current_user
+from app.domain.auth.models import UserClaims
from app.shared.async_utils import iter_in_thread
router = APIRouter(prefix="/perception", tags=["智能感知"])
@@ -65,3 +67,77 @@ async def analyze_event(event_id: str):
"X-Accel-Buffering": "no",
},
)
+
+
+@router.post("/crawl")
+async def run_crawl(
+ body: dict = None,
+ current_user: UserClaims = Depends(get_current_user),
+):
+ """Trigger manual crawl of regulatory sources. Streams SSE progress.
+
+ Body (optional): {"sources": ["CATARC", "国标委·强制性", "EUR-Lex"]}
+ Omit sources to crawl all registered sources.
+ """
+ sources: list[str] | None = (body or {}).get("sources")
+ crawl_svc = get_crawl_service()
+
+ async def crawl_stream():
+ async for item in iter_in_thread(crawl_svc.run_crawl(sources=sources)):
+ event_name = item.get("event", "message")
+ data = item.get("data", "")
+ if isinstance(data, (dict, list)):
+ data = json.dumps(data, ensure_ascii=False)
+ yield f"event: {event_name}\ndata: {data}\n\n"
+
+ return StreamingResponse(
+ crawl_stream(),
+ media_type="text/event-stream",
+ headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
+ )
+
+
+@router.post("/events/{event_id}/process")
+async def process_event(
+ event_id: str,
+ current_user: UserClaims = Depends(get_current_user),
+):
+ """Trigger LLM pipeline (extract + assess + diff) for a single event."""
+ from datetime import UTC, datetime
+ from app.infrastructure.perception.llm_pipeline import LlmPipeline
+ from app.shared.bootstrap import get_retrieval_service
+
+ event = get_perception_service().get_event(event_id)
+ if not event:
+ from fastapi import HTTPException
+ raise HTTPException(status_code=404, detail=f"Event {event_id} not found")
+
+ store = get_event_store()
+ pipeline = LlmPipeline()
+
+ structure = pipeline.extract_structure(event)
+ event.update(structure)
+ event["affected_docs"] = pipeline.assess_impact(event, get_retrieval_service())
+ event["processed_at"] = datetime.now(UTC).isoformat()
+ store.upsert(event)
+
+ return {"status": "ok", "event_id": event_id, "processed_at": event["processed_at"]}
+
+
+@router.get("/events/{event_id}/diff")
+async def get_event_diff(event_id: str):
+ """Return semantic diff detail for an event (only available if previously crawled twice)."""
+ event = get_perception_service().get_event(event_id)
+ if not event:
+ from fastapi import HTTPException
+ raise HTTPException(status_code=404, detail=f"Event {event_id} not found")
+ if not event.get("change_summary"):
+ from fastapi import HTTPException
+ raise HTTPException(status_code=404, detail="No diff available for this event")
+ return {
+ "event_id": event_id,
+ "change_summary": event.get("change_summary"),
+ "changed_sections": event.get("changed_sections") or [],
+ "previous_hash": event.get("previous_hash"),
+ "content_hash": event.get("content_hash"),
+ }
diff --git a/backend/app/application/perception/crawl_service.py b/backend/app/application/perception/crawl_service.py
new file mode 100644
index 0000000..afcc452
--- /dev/null
+++ b/backend/app/application/perception/crawl_service.py
@@ -0,0 +1,147 @@
+"""Orchestrates regulatory source crawlers and LLM enrichment pipeline."""
+
+from __future__ import annotations
+
+import hashlib
+from typing import Any, Generator
+
+from loguru import logger
+
+from app.infrastructure.perception.base_event_store import BaseEventStore
+from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
+from app.infrastructure.perception.llm_pipeline import LlmPipeline
+
+
+def _event_id(source: str, standard_code: str) -> str:
+ """Deterministic 12-char ID from source + standard_code."""
+ return hashlib.sha256(f"{source}-{standard_code}".encode()).hexdigest()[:12]
+
+
+def _content_hash(raw_text: str) -> str:
+ return hashlib.sha256(raw_text.encode()).hexdigest()
+
+
+def _raw_to_dict(raw: RawEvent, event_id: str, content_hash: str) -> dict:
+ return {
+ "id": event_id,
+ "source": raw.source,
+ "source_label": raw.source_label,
+ "standard_code": raw.standard_code,
+ "title": raw.title,
+ "summary": raw.summary,
+ "full_text_url": raw.full_text_url,
+ "status": raw.status,
+ "impact_level": "medium",
+ "published_at": raw.published_at,
+ "effective_at": raw.effective_at,
+ "category": raw.category,
+ "tags": raw.tags,
+ "content_hash": content_hash,
+ "previous_hash": None,
+ }
+
+
+class CrawlService:
+ """Orchestrate crawlers, hash-based change detection, and LLM enrichment."""
+
+ def __init__(
+ self,
+ crawlers: dict[str, BaseCrawler],
+ event_store: BaseEventStore,
+ llm_pipeline: LlmPipeline,
+ retrieval_service: Any,
+ ) -> None:
+ self._crawlers = crawlers
+ self._store = event_store
+ self._pipeline = llm_pipeline
+ self._retrieval = retrieval_service
+
+ def run_crawl(
+ self, sources: list[str] | None = None
+ ) -> Generator[dict, None, None]:
+ """Run crawl for selected sources. Yields SSE-ready progress dicts."""
+ targets = sources or list(self._crawlers.keys())
+ total_new = 0
+ total_updated = 0
+
+ for source_key in targets:
+ crawler = self._crawlers.get(source_key)
+ if not crawler:
+ yield {"event": "error", "data": f"Unknown source: {source_key}"}
+ continue
+
+ yield {"event": "progress", "data": {"source": source_key, "stage": "fetching"}}
+ try:
+ raw_events = crawler.fetch(limit=100)
+ except Exception as exc:
+ logger.exception("Crawler failed source={}", source_key)
+ yield {"event": "error", "data": {"source": source_key, "message": str(exc)}}
+ continue
+
+ yield {
+ "event": "progress",
+ "data": {"source": source_key, "stage": "processing", "fetched": len(raw_events)},
+ }
+
+ new_count = 0
+ updated_count = 0
+
+ for raw in raw_events:
+ eid = _event_id(raw.source, raw.standard_code)
+ new_hash = _content_hash(raw.raw_text or raw.title)
+ existing = self._store.get(eid)
+
+ if existing and existing.get("content_hash") == new_hash:
+ continue
+
+ is_update = existing is not None
+ old_text = existing.get("summary", "") if is_update else ""
+ previous_hash = existing.get("content_hash") if is_update else None
+
+ event_dict = _raw_to_dict(raw, eid, new_hash)
+ event_dict["previous_hash"] = previous_hash
+
+ try:
+ structure = self._pipeline.extract_structure(event_dict)
+ event_dict.update(structure)
+ except Exception as exc:
+ logger.warning("Structure extraction failed id={} err={}", eid, exc)
+
+ try:
+ affected = self._pipeline.assess_impact(event_dict, self._retrieval)
+ event_dict["affected_docs"] = affected
+ except Exception as exc:
+ logger.warning("Impact assessment failed id={} err={}", eid, exc)
+
+ if is_update and old_text and raw.raw_text:
+ try:
+ diff = self._pipeline.compute_diff(old_text, raw.raw_text)
+ event_dict["change_summary"] = diff.get("change_summary")
+ event_dict["changed_sections"] = diff.get("changed_sections")
+ except Exception as exc:
+ logger.warning("Diff failed id={} err={}", eid, exc)
+
+ self._store.upsert(event_dict)
+
+ if is_update:
+ updated_count += 1
+ else:
+ new_count += 1
+
+ total_new += new_count
+ total_updated += updated_count
+
+ yield {
+ "event": "progress",
+ "data": {
+ "source": source_key,
+ "stage": "done",
+ "new": new_count,
+ "updated": updated_count,
+ },
+ }
+
+ yield {
+ "event": "done",
+ "data": {"total_new": total_new, "total_updated": total_updated},
+ }
diff --git a/backend/app/application/perception/services.py b/backend/app/application/perception/services.py
index bda2f56..c49cd15 100644
--- a/backend/app/application/perception/services.py
+++ b/backend/app/application/perception/services.py
@@ -6,7 +6,7 @@ import json
from typing import Generator
from app.application.knowledge.services import KnowledgeRetrievalService
-from app.infrastructure.perception.mock_event_store import MockEventStore
+from app.infrastructure.perception.base_event_store import BaseEventStore
from app.services.llm.llm_factory import get_llm_client
from app.config.settings import settings
@@ -22,7 +22,7 @@ class PerceptionService:
def __init__(
self,
- event_store: MockEventStore,
+ event_store: BaseEventStore,
retrieval_service: KnowledgeRetrievalService,
) -> None:
self._store = event_store
diff --git a/backend/app/config/settings.py b/backend/app/config/settings.py
index ffdd480..917ab51 100644
--- a/backend/app/config/settings.py
+++ b/backend/app/config/settings.py
@@ -87,6 +87,18 @@ class Settings(BaseSettings):
# no external worker needed. Switch to True only when a Celery worker is running.
use_celery_worker: bool = Field(default=False, description="使用 Celery Worker 异步处理文档 (需要 Worker 运行中)")
+ # ── Perception crawl ──────────────────────────────────────────────────────
+ perception_crawl_timeout_seconds: int = Field(
+ default=120, description="HTTP timeout for regulatory source crawlers."
+ )
+ perception_max_events_per_source: int = Field(
+ default=100, description="Maximum events fetched per source per crawl run."
+ )
+ perception_diff_similarity_threshold: float = Field(
+ default=0.85,
+ description="Cosine similarity below which a paragraph is flagged as changed.",
+ )
+
# Keep configuration setup explicit so runtime behavior is easy to reason about.
api_host: str = Field(default="0.0.0.0", description="API服务地址")
api_port: int = Field(default=8000, description="API服务端口")
diff --git a/backend/app/infrastructure/perception/base_event_store.py b/backend/app/infrastructure/perception/base_event_store.py
new file mode 100644
index 0000000..2314424
--- /dev/null
+++ b/backend/app/infrastructure/perception/base_event_store.py
@@ -0,0 +1,39 @@
+"""Abstract base class for regulatory event stores."""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+
+
+class BaseEventStore(ABC):
+ """Port interface for regulatory event persistence."""
+
+ @abstractmethod
+ def all(self) -> list[dict]:
+ """Return all events, most-recent first."""
+
+ @abstractmethod
+ def get(self, event_id: str) -> dict | None:
+ """Return a single event by ID, or None."""
+
+ @abstractmethod
+ def filter(
+ self,
+ *,
+ source: str | None = None,
+ impact_level: str | None = None,
+ limit: int = 50,
+ ) -> list[dict]:
+ """Return filtered events sorted by published_at descending."""
+
+ @abstractmethod
+ def stats(self) -> dict:
+ """Return {total, high_impact, medium_impact, low_impact, recent_90d}."""
+
+ @abstractmethod
+ def upsert(self, event: dict) -> None:
+ """Insert or update an event record."""
+
+ @abstractmethod
+ def get_by_standard_code(self, standard_code: str) -> dict | None:
+ """Return the most-recent event with matching standard_code, or None."""
diff --git a/backend/app/infrastructure/perception/crawlers/__init__.py b/backend/app/infrastructure/perception/crawlers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/backend/app/infrastructure/perception/crawlers/_utils.py b/backend/app/infrastructure/perception/crawlers/_utils.py
new file mode 100644
index 0000000..d2f96b1
--- /dev/null
+++ b/backend/app/infrastructure/perception/crawlers/_utils.py
@@ -0,0 +1,43 @@
+"""Shared utility functions for crawlers."""
+
+from __future__ import annotations
+
+import re
+from datetime import date
+
+
+def parse_date(text: str) -> str:
+ """Return YYYY-MM-DD from common Chinese date formats, or today's date."""
+ text = text.strip()
+ if not text:
+ return date.today().isoformat()
+ m = re.search(r"(\d{4})[/-](\d{1,2})[/-](\d{1,2})", text)
+ if m:
+ try:
+ return date(int(m.group(1)), int(m.group(2)), int(m.group(3))).isoformat()
+ except ValueError:
+ pass
+ m2 = re.search(r"(\d{4})年(\d{1,2})月(\d{1,2})日?", text)
+ if m2:
+ try:
+ return date(int(m2.group(1)), int(m2.group(2)), int(m2.group(3))).isoformat()
+ except ValueError:
+ pass
+ return date.today().isoformat()
+
+
+def extract_tags(standard_code: str, title: str) -> list[str]:
+ """Derive simple keyword tags from standard code and title."""
+ tags: list[str] = []
+ code_upper = standard_code.upper()
+ if "GB" in code_upper:
+ tags.append("国家标准")
+ if "/T" in code_upper:
+ tags.append("推荐性")
+ else:
+ tags.append("强制性")
+ keywords = ["电动", "安全", "自动驾驶", "充电", "智能网联", "碰撞", "排放", "网络安全"]
+ for kw in keywords:
+ if kw in title:
+ tags.append(kw)
+ return tags[:5]
diff --git a/backend/app/infrastructure/perception/crawlers/base.py b/backend/app/infrastructure/perception/crawlers/base.py
new file mode 100644
index 0000000..b359a5c
--- /dev/null
+++ b/backend/app/infrastructure/perception/crawlers/base.py
@@ -0,0 +1,32 @@
+"""Shared contracts for regulatory source crawlers."""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+
+
+@dataclass
+class RawEvent:
+ """Raw regulatory event returned by a crawler before enrichment.
+
+ Crawlers fill every field themselves; ``tags`` and ``raw_text`` are the only
+ fields with defaults.
+ """
+
+ # Short key of the originating source (e.g. "CATARC", "EUR-Lex").
+ source: str
+ # Human-readable name of the source.
+ source_label: str
+ # Identifier of the standard/regulation as published by the source.
+ standard_code: str
+ title: str
+ summary: str
+ # Link to the detail page or full text of the event.
+ full_text_url: str
+ status: str # one of 'enacted' | 'draft' | 'consultation'
+ published_at: str # date as a YYYY-MM-DD string
+ # Effective date, or None when the source does not provide one.
+ effective_at: str | None
+ category: str
+ tags: list[str] = field(default_factory=list)
+ raw_text: str = "" # full crawled text, used for hashing and as LLM input
+
+
+class BaseCrawler(ABC):
+ """Abstract regulatory source crawler."""
+
+ @abstractmethod
+ def fetch(self, limit: int = 50) -> list[RawEvent]:
+ """Fetch up to `limit` recent events from the data source."""
diff --git a/backend/app/infrastructure/perception/crawlers/catarc_crawler.py b/backend/app/infrastructure/perception/crawlers/catarc_crawler.py
new file mode 100644
index 0000000..3ff5dd9
--- /dev/null
+++ b/backend/app/infrastructure/perception/crawlers/catarc_crawler.py
@@ -0,0 +1,83 @@
+"""Crawler for CATARC automotive standard catalogue."""
+
+from __future__ import annotations
+
+from urllib.parse import urljoin
+
+import httpx
+from bs4 import BeautifulSoup
+from loguru import logger
+
+from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
+from ._utils import extract_tags, parse_date
+
+# Listing page of the CATARC standard catalogue; the crawler requests it as "?page=N".
+_BASE_URL = "https://www.catarc.org.cn/bzzxd/qcbz/index.html"
+# Site origin used to resolve relative detail links found in the listing.
+_HOST = "https://www.catarc.org.cn"
+
+# Maps the status text shown in the listing table to the internal status values.
+# Unlisted texts fall back to "enacted" at the lookup site.
+# NOTE(review): "废止" (abolished) also maps to "enacted" — confirm this is intended.
+_STATUS_MAP = {
+ "现行": "enacted",
+ "即将实施": "enacted",
+ "废止": "enacted",
+ "征求意见": "consultation",
+ "报批": "draft",
+}
+
+
+class CatarcCrawler(BaseCrawler):
+ """Scrape the CATARC automotive standard list page."""
+
+ def fetch(self, limit: int = 50) -> list[RawEvent]:
+ events: list[RawEvent] = []
+ page = 1
+ max_pages = max(10, limit)
+ while len(events) < limit and page <= max_pages:
+ url = f"{_BASE_URL}?page={page}"
+ try:
+ resp = httpx.get(url, timeout=30, follow_redirects=True)
+ resp.raise_for_status()
+ except Exception as exc:
+ logger.warning("CATARC fetch failed page={} err={}", page, exc)
+ break
+
+ soup = BeautifulSoup(resp.text, "lxml")
+ rows = soup.select("table tr")
+ if not rows:
+ break
+
+ batch: list[RawEvent] = []
+ for row in rows:
+ cells = row.find_all("td")
+ if len(cells) < 3:
+ continue
+ link = cells[0].find("a")
+ standard_code = link.get_text(strip=True) if link else cells[0].get_text(strip=True)
+ title = cells[1].get_text(strip=True) if len(cells) > 1 else standard_code
+ date_text = cells[2].get_text(strip=True) if len(cells) > 2 else ""
+ published_at = parse_date(date_text)
+ status_text = cells[3].get_text(strip=True) if len(cells) > 3 else ""
+ status = _STATUS_MAP.get(status_text, "enacted")
+ detail_url = urljoin(_HOST, link["href"]) if link and link.get("href") else url
+ raw_text = f"{standard_code} {title}"
+ batch.append(RawEvent(
+ source="CATARC",
+ source_label="全国汽车标准化技术委员会",
+ standard_code=standard_code,
+ title=title,
+ summary=title,
+ full_text_url=detail_url,
+ status=status,
+ published_at=published_at,
+ effective_at=None,
+ category="汽车标准",
+ tags=extract_tags(standard_code, title),
+ raw_text=raw_text,
+ ))
+
+ if not batch:
+ break
+ events.extend(batch)
+ page += 1
+
+ return events[:limit]
+
+
diff --git a/backend/app/infrastructure/perception/crawlers/eurlex_crawler.py b/backend/app/infrastructure/perception/crawlers/eurlex_crawler.py
new file mode 100644
index 0000000..3f5fdd2
--- /dev/null
+++ b/backend/app/infrastructure/perception/crawlers/eurlex_crawler.py
@@ -0,0 +1,117 @@
+"""Crawler for EUR-Lex RSS feeds covering EU AI Act and automotive regulations."""
+
+from __future__ import annotations
+
+import re
+from email.utils import parsedate_to_datetime
+
+import httpx
+from bs4 import BeautifulSoup
+from loguru import logger
+
+from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
+from ._utils import parse_date
+
+# RSS feeds polled by the EUR-Lex crawler, in order.
+_EURLEX_RSS_URLS = [
+ "https://eur-lex.europa.eu/rss-feed/OJ-L.rss",
+]
+
+# A feed item is kept only if its title or description contains one of these
+# keywords; matching is a case-insensitive substring test.
+_AUTOMOTIVE_KEYWORDS = [
+ "vehicle", "automotive", "motor", "tyre", "emission", "ADAS", "autonomous",
+ "AI Act", "artificial intelligence", "cybersecurity", "software update",
+ "R155", "R156", "汽车", "车辆",
+]
+
+
+# Lower-cased once at import time so the per-item check does not redo it.
+_AUTOMOTIVE_KEYWORDS_LOWER = [kw.lower() for kw in _AUTOMOTIVE_KEYWORDS]
+
+
+def _is_automotive_relevant(title: str, description: str) -> bool:
+ combined = (title + " " + description).lower()
+ return any(kw in combined for kw in _AUTOMOTIVE_KEYWORDS_LOWER)
+
+
+def _extract_celex(url: str) -> str:
+ m = re.search(r"CELEX[:/]([0-9A-Z]+)", url)
+ return m.group(1) if m else ""
+
+
+def _parse_rss_date(rfc2822: str) -> str:
+ try:
+ dt = parsedate_to_datetime(rfc2822)
+ return dt.date().isoformat()
+ except Exception:
+ return parse_date(rfc2822)
+
+
+class EurlexCrawler(BaseCrawler):
+ """Fetch automotive-relevant EU regulations from EUR-Lex RSS feeds."""
+
+ def fetch(self, limit: int = 50) -> list[RawEvent]:
+ events: list[RawEvent] = []
+ for rss_url in _EURLEX_RSS_URLS:
+ if len(events) >= limit:
+ break
+ try:
+ resp = httpx.get(rss_url, timeout=30, follow_redirects=True)
+ resp.raise_for_status()
+ except Exception as exc:
+ logger.warning("EUR-Lex RSS fetch failed url={} err={}", rss_url, exc)
+ continue
+
+ soup = BeautifulSoup(resp.content, "lxml-xml")
+ for item in soup.find_all("item"):
+ if len(events) >= limit:
+ break
+ title_tag = item.find("title")
+ title = title_tag.get_text(strip=True) if title_tag else ""
+ desc_tag = item.find("description")
+ description = desc_tag.get_text(strip=True) if desc_tag else ""
+ link_tag = item.find("link")
+ link = link_tag.get_text(strip=True) if link_tag else ""
+ pub_date_tag = item.find("pubDate")
+ pub_date = pub_date_tag.get_text(strip=True) if pub_date_tag else ""
+
+ if not _is_automotive_relevant(title, description):
+ continue
+
+ celex = _extract_celex(link)
+ standard_code = celex if celex else title[:60]
+ published_at = _parse_rss_date(pub_date) if pub_date else ""
+
+ events.append(RawEvent(
+ source="EUR-Lex",
+ source_label="欧盟官方公报",
+ standard_code=standard_code,
+ title=title,
+ summary=description[:500],
+ full_text_url=link,
+ status="enacted",
+ published_at=published_at,
+ effective_at=None,
+ category="EU法规",
+ tags=_extract_eurlex_tags(title, description),
+ raw_text=f"{title}\n{description}",
+ ))
+
+ return events[:limit]
+
+
+def _extract_eurlex_tags(title: str, description: str) -> list[str]:
+ combined = title + " " + description
+ tag_map = {
+ "AI Act": "EU AI Act",
+ "artificial intelligence": "EU AI Act",
+ "R155": "UN R155",
+ "R156": "UN R156",
+ "cybersecurity": "网络安全",
+ "emission": "排放",
+ "autonomous": "自动驾驶",
+ "ADAS": "ADAS",
+ }
+ combined_lower = combined.lower()
+ tags = []
+ for kw, tag in tag_map.items():
+ if kw.lower() in combined_lower:
+ tags.append(tag)
+ return tags[:5]
diff --git a/backend/app/infrastructure/perception/crawlers/guobiao_crawler.py b/backend/app/infrastructure/perception/crawlers/guobiao_crawler.py
new file mode 100644
index 0000000..77c5b7b
--- /dev/null
+++ b/backend/app/infrastructure/perception/crawlers/guobiao_crawler.py
@@ -0,0 +1,92 @@
+"""Crawlers for the 国标委 (SAMR) standard information platform."""
+
+from __future__ import annotations
+
+import httpx
+from loguru import logger
+
+from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
+from ._utils import extract_tags, parse_date
+
+# Listing endpoint of the SAMR open-standards platform queried by _fetch_page.
+_BASE_URL = "https://openstd.samr.gov.cn/bzgk/std/std_list_type"
+# Request headers sent with every listing request (identifies the bot).
+_HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; RegulatoryBot/1.0)"}
+
+
+def _fetch_page(std_type: int, page: int, page_size: int) -> list[dict]:
+ params = {
+ "p.p1": std_type,
+ "p.p2": "车",
+ "p.p90": "circulation_date",
+ "p.p91": "desc",
+ "p.p6": page,
+ "p.p7": page_size,
+ }
+ try:
+ resp = httpx.get(_BASE_URL, params=params, headers=_HEADERS, timeout=30)
+ resp.raise_for_status()
+ data = resp.json()
+ return data.get("rows", []) or []
+ except Exception as exc:
+ logger.warning("国标委 fetch failed type={} page={} err={}", std_type, page, exc)
+ return []
+
+
+def _row_to_raw_event(row: dict, source_label: str) -> RawEvent:
+ standard_code = row.get("std_code", "")
+ title = row.get("std_name", standard_code)
+ published_at = parse_date(row.get("release_date", ""))
+ effective_at_raw = row.get("implement_date", "")
+ effective_at = parse_date(effective_at_raw) if effective_at_raw else None
+ status_text = row.get("std_status", "")
+ if "征求意见" in status_text:
+ status = "consultation"
+ elif "报批" in status_text or "草案" in status_text:
+ status = "draft"
+ else:
+ status = "enacted"
+ return RawEvent(
+ source="国标委",
+ source_label=source_label,
+ standard_code=standard_code,
+ title=title,
+ summary=title,
+ full_text_url=f"https://openstd.samr.gov.cn/bzgk/std/detail?id={row.get('id', '')}",
+ status=status,
+ published_at=published_at,
+ effective_at=effective_at,
+ category=row.get("std_type", "国家标准"),
+ tags=extract_tags(standard_code, title),
+ raw_text=f"{standard_code} {title}",
+ )
+
+
+class GuobiaoMandatoryCrawler(BaseCrawler):
+ """Fetch mandatory national standards (强制性) related to vehicles."""
+
+ def fetch(self, limit: int = 50) -> list[RawEvent]:
+ events: list[RawEvent] = []
+ page = 1
+ max_pages = max(10, limit)
+ while len(events) < limit and page <= max_pages:
+ rows = _fetch_page(std_type=1, page=page, page_size=20)
+ if not rows:
+ break
+ events.extend(_row_to_raw_event(r, "国标委·强制性") for r in rows)
+ page += 1
+ return events[:limit]
+
+
+class GuobiaoRecommendedCrawler(BaseCrawler):
+ """Fetch recommended national standards (推荐性) related to vehicles."""
+
+ def fetch(self, limit: int = 50) -> list[RawEvent]:
+ events: list[RawEvent] = []
+ page = 1
+ max_pages = max(10, limit)
+ while len(events) < limit and page <= max_pages:
+ rows = _fetch_page(std_type=2, page=page, page_size=20)
+ if not rows:
+ break
+ events.extend(_row_to_raw_event(r, "国标委·推荐性") for r in rows)
+ page += 1
+ return events[:limit]
diff --git a/backend/app/infrastructure/perception/llm_pipeline.py b/backend/app/infrastructure/perception/llm_pipeline.py
new file mode 100644
index 0000000..37cdce5
--- /dev/null
+++ b/backend/app/infrastructure/perception/llm_pipeline.py
@@ -0,0 +1,241 @@
+"""LLM-driven pipeline for regulatory event enrichment."""
+
+from __future__ import annotations
+
+import json
+import math
+from typing import Any
+
+from loguru import logger
+
+from app.config.settings import settings
+from app.infrastructure.embedding.openai_compatible_embedding_provider import (
+ OpenAICompatibleEmbeddingProvider,
+)
+from app.services.llm.llm_factory import get_llm_client
+
+# System prompt for the structure-extraction step.
+_EXTRACT_SYSTEM = (
+ "You are a regulatory compliance expert specialising in automotive standards "
+ "(GB, UN-ECE, ISO, EU). Extract structured information from regulation text. "
+ "Return valid JSON only — no markdown fences, no extra keys."
+)
+
+# System prompt for the impact-assessment step.
+_ASSESS_SYSTEM = (
+ "You are an automotive compliance analyst. Given a regulation and related document excerpts, "
+ "identify which documents are affected and what actions are required. "
+ "Return a JSON array only."
+)
+
+# System prompt for classifying a single changed paragraph in the diff step.
+_DIFF_SYSTEM = (
+ "You are a regulatory change analyst. Given an old and new version of a regulation paragraph, "
+ "classify the type of change and summarise it. "
+ "Return JSON only: {\"change_type\": \"tightened|relaxed|added|removed\", \"summary\": \"...\"}"
+)
+
+# Paragraph pairs whose cosine similarity is below this value are treated as changed.
+# NOTE(review): settings.perception_diff_similarity_threshold carries the same
+# default (0.85) but is not read here — confirm which one should be authoritative.
+_SIMILARITY_THRESHOLD = 0.85
+
+
+def _cosine(a: list[float], b: list[float]) -> float:
+ dot = sum(x * y for x, y in zip(a, b))
+ norm_a = math.sqrt(sum(x * x for x in a))
+ norm_b = math.sqrt(sum(x * x for x in b))
+ if norm_a == 0 or norm_b == 0:
+ return 0.0
+ return dot / (norm_a * norm_b)
+
+
+def _llm_json(client: Any, messages: list[dict]) -> Any:
+ """Call LLM and parse JSON response; return None on failure."""
+ try:
+ resp = client.chat(messages)
+ text = (resp.content or "").strip()
+ if text.startswith("```"):
+ text = text.split("```")[1]
+ if text.startswith("json"):
+ text = text[4:]
+ return json.loads(text)
+ except Exception as exc:
+ logger.warning("LLM JSON parse failed: {}", exc)
+ return None
+
+
+class LlmPipeline:
+ """Three-step enrichment pipeline for crawled regulatory events.
+
+ Steps: ``extract_structure`` (LLM), ``assess_impact`` (retrieval + LLM) and
+ ``compute_diff`` (embeddings + LLM). Each step degrades to a default result
+ instead of raising when the LLM reply cannot be parsed.
+ """
+
+ def __init__(self) -> None:
+ # LLM client built from the provider/model configured in settings.
+ self._client = get_llm_client(
+ provider=settings.llm_provider,
+ model=settings.llm_model,
+ )
+ # Embedding provider used only by compute_diff.
+ self._embedder = OpenAICompatibleEmbeddingProvider()
+
+ # ------------------------------------------------------------------
+ # Step 1: Structure extraction
+ # ------------------------------------------------------------------
+
+ def extract_structure(self, event: dict) -> dict:
+ """Extract obligations, deadlines, scope, penalties, impact_level from event text.
+
+ Returns the parsed LLM reply when it is a JSON object; otherwise a default
+ dict with empty values and ``impact_level`` "medium".
+ """
+ prompt = f"""Extract structured compliance information from this regulation:
+
+Standard: {event.get('standard_code', '')}
+Title: {event.get('title', '')}
+Source: {event.get('source_label', '')}
+Summary: {event.get('summary', '')}
+Tags: {', '.join(event.get('tags') or [])}
+
+Return JSON with exactly these keys:
+{{
+ "obligations": [{{"text": "...", "deontic": "must|shall|may|prohibited", "subject": "...", "object": "...", "condition": ""}}],
+ "deadlines": [{{"date": "YYYY-MM-DD or null", "description": "..."}}],
+ "scope": "one sentence describing who/what this applies to",
+ "penalties": "one sentence on consequences of non-compliance, or null",
+ "impact_level": "high|medium|low"
+}}"""
+
+ messages = [
+ {"role": "system", "content": _EXTRACT_SYSTEM},
+ {"role": "user", "content": prompt},
+ ]
+ result = _llm_json(self._client, messages)
+ if not isinstance(result, dict):
+ return {
+ "obligations": [],
+ "deadlines": [],
+ "scope": "",
+ "penalties": "",
+ "impact_level": "medium",
+ }
+ # NOTE(review): the parsed dict is returned as-is; its keys and value types
+ # are not validated here, so callers that merge it should guard against
+ # unexpected keys.
+ return result
+
+ # ------------------------------------------------------------------
+ # Step 2: Impact assessment
+ # ------------------------------------------------------------------
+
+ def assess_impact(self, event: dict, retrieval_service: Any) -> list[dict]:
+ """Use RAG to find affected documents and generate recommendations.
+
+ Returns an empty list when retrieval fails or finds nothing, the LLM's
+ JSON array when it can be parsed, and otherwise the raw retrieved
+ document excerpts.
+ """
+ obligations = event.get("obligations") or []
+ # Only the first three obligations are used to build the retrieval query.
+ # NOTE(review): assumes each obligation is a dict — TODO confirm for LLM output.
+ obligation_texts = " ".join(o.get("text", "") for o in obligations[:3])
+ query = f"{event.get('standard_code', '')} {event.get('title', '')} {obligation_texts}"
+
+ try:
+ chunks = retrieval_service.retrieve(query=query, top_k=5)
+ except Exception as exc:
+ logger.warning("RAG retrieval failed: {}", exc)
+ return []
+
+ if not chunks:
+ return []
+
+ # Keep one excerpt per document: the first chunk seen for each doc_id.
+ seen: set[str] = set()
+ doc_excerpts: list[dict] = []
+ for chunk in chunks:
+ if chunk.doc_id not in seen:
+ seen.add(chunk.doc_id)
+ doc_excerpts.append({
+ "doc_id": chunk.doc_id,
+ "doc_name": chunk.doc_title,
+ "score": round(float(chunk.score if chunk.score is not None else 0), 4),
+ "snippet": (chunk.text or "")[:300],
+ "clause": getattr(chunk, "section_title", "") or "",
+ })
+
+ context = "\n".join(
+ f"[{d['doc_name']} {d['clause']}] score={d['score']}: {d['snippet']}"
+ for d in doc_excerpts
+ )
+ prompt = f"""Regulation: {event.get('standard_code')} — {event.get('title')}
+Obligations: {obligation_texts or event.get('summary', '')}
+
+Affected documents found in knowledge base:
+{context}
+
+For each document, assess impact and recommend action. Return JSON array:
+[{{"doc_id":"...","doc_name":"...","score":0.0,"key_clauses":"...","recommendation":"one sentence action"}}]"""
+
+ messages = [
+ {"role": "system", "content": _ASSESS_SYSTEM},
+ {"role": "user", "content": prompt},
+ ]
+ result = _llm_json(self._client, messages)
+ if isinstance(result, list):
+ # Overwrite LLM-reported scores with the retrieval scores for known doc_ids.
+ score_map = {d["doc_id"]: d["score"] for d in doc_excerpts}
+ for item in result:
+ if isinstance(item, dict) and item.get("doc_id") in score_map:
+ item["score"] = score_map[item["doc_id"]]
+ return result
+ # Unparseable LLM reply: fall back to the retrieved excerpts themselves.
+ return doc_excerpts
+
+ # ------------------------------------------------------------------
+ # Step 3: Semantic diff
+ # ------------------------------------------------------------------
+
+ def compute_diff(self, old_text: str, new_text: str) -> dict:
+ """Compare old and new regulation text; return changed sections and summary.
+
+ Both texts are split into non-empty lines ("paragraphs"), which are
+ compared pairwise by position using embedding cosine similarity.
+ Returns ``{"changed_sections": [...], "change_summary": "..."}``.
+ """
+ old_paras = [p.strip() for p in old_text.split("\n") if p.strip()]
+ new_paras = [p.strip() for p in new_text.split("\n") if p.strip()]
+
+ if not old_paras or not new_paras:
+ return {"changed_sections": [], "change_summary": "No comparable text."}
+
+ # Embed old and new paragraphs in one call, then split the result back.
+ all_paras = old_paras + new_paras
+ try:
+ all_embeddings = self._embedder.embed_texts(all_paras)
+ except Exception as exc:
+ logger.warning("Embedding for diff failed: {}", exc)
+ return {"changed_sections": [], "change_summary": "Diff unavailable (embedding error)."}
+
+ old_embeddings = all_embeddings[: len(old_paras)]
+ new_embeddings = all_embeddings[len(old_paras):]
+
+ changed_sections: list[dict] = []
+ max_len = max(len(old_paras), len(new_paras))
+
+ # NOTE(review): paragraphs are paired by index, so a paragraph inserted
+ # near the top shifts every later pair — confirm this is acceptable.
+ for i in range(max_len):
+ if i >= len(old_paras):
+ # New paragraph added
+ changed_sections.append({
+ "old_text": "",
+ "new_text": new_paras[i][:300],
+ "similarity": 0.0,
+ "change_type": "added",
+ "summary": "New paragraph added.",
+ })
+ continue
+ if i >= len(new_paras):
+ # Old paragraph removed
+ changed_sections.append({
+ "old_text": old_paras[i][:300],
+ "new_text": "",
+ "similarity": 0.0,
+ "change_type": "removed",
+ "summary": "Paragraph removed.",
+ })
+ continue
+ # Both exist — compare via embeddings
+ sim = _cosine(old_embeddings[i], new_embeddings[i])
+ if sim < _SIMILARITY_THRESHOLD:
+ # Ask the LLM to classify the change; an unparseable reply yields {}
+ # and the "modified" / "" defaults below.
+ messages = [
+ {"role": "system", "content": _DIFF_SYSTEM},
+ {"role": "user", "content": f"OLD: {old_paras[i][:500]}\nNEW: {new_paras[i][:500]}"},
+ ]
+ classification = _llm_json(self._client, messages) or {}
+ changed_sections.append({
+ "old_text": old_paras[i][:300],
+ "new_text": new_paras[i][:300],
+ "similarity": round(sim, 3),
+ "change_type": classification.get("change_type", "modified"),
+ "summary": classification.get("summary", ""),
+ })
+
+ if not changed_sections:
+ change_summary = "No substantive changes detected between versions."
+ else:
+ types = [s["change_type"] for s in changed_sections]
+ # The distinct change types are joined in set iteration order, which is
+ # not a fixed order.
+ change_summary = (
+ f"{len(changed_sections)} paragraph(s) changed: "
+ + ", ".join(f"{t}" for t in set(types))
+ + ". "
+ + (changed_sections[0].get("summary", "") if changed_sections else "")
+ )
+
+ return {"changed_sections": changed_sections, "change_summary": change_summary}
diff --git a/backend/app/infrastructure/perception/mock_event_store.py b/backend/app/infrastructure/perception/mock_event_store.py
index a927cee..71a8e60 100644
--- a/backend/app/infrastructure/perception/mock_event_store.py
+++ b/backend/app/infrastructure/perception/mock_event_store.py
@@ -4,6 +4,8 @@ from __future__ import annotations
from typing import Any
+from app.infrastructure.perception.base_event_store import BaseEventStore
+
MOCK_EVENTS: list[dict[str, Any]] = [
# ------------------------------------------------------------------ HIGH
{
@@ -379,18 +381,18 @@ MOCK_EVENTS: list[dict[str, Any]] = [
},
]
-# Index for fast lookup
-_EVENT_INDEX: dict[str, dict] = {e["id"]: e for e in MOCK_EVENTS}
-
-
-class MockEventStore:
+class MockEventStore(BaseEventStore):
"""In-memory mock store for regulatory events."""
+    def __init__(self) -> None:
+        """Seed the store with shallow copies of ``MOCK_EVENTS``.
+
+        Every instance owns its own event dicts; the id index refers to
+        those same dict objects.
+        """
+        copies: list[dict] = []
+        by_id: dict[str, dict] = {}
+        for template in MOCK_EVENTS:
+            record = dict(template)
+            copies.append(record)
+            by_id[record["id"]] = record
+        self._events: list[dict] = copies
+        self._index: dict[str, dict] = by_id
+
def all(self) -> list[dict]:
- return list(MOCK_EVENTS)
+ return list(self._events)
def get(self, event_id: str) -> dict | None:
- return _EVENT_INDEX.get(event_id)
+ return self._index.get(event_id)
def filter(
self,
@@ -399,23 +401,39 @@ class MockEventStore:
impact_level: str | None = None,
limit: int = 50,
) -> list[dict]:
- events = list(MOCK_EVENTS)
+ events = list(self._events)
if source:
events = [e for e in events if e["source"] == source]
if impact_level:
events = [e for e in events if e["impact_level"] == impact_level]
- events.sort(key=lambda e: e["published_at"], reverse=True)
+ events.sort(key=lambda e: e.get("published_at") or "", reverse=True)
return events[:limit]
def stats(self) -> dict:
from datetime import date, timedelta
- events = MOCK_EVENTS
+ events = self._events
cutoff = (date.today() - timedelta(days=90)).isoformat()
return {
"total": len(events),
"high_impact": sum(1 for e in events if e["impact_level"] == "high"),
"medium_impact": sum(1 for e in events if e["impact_level"] == "medium"),
"low_impact": sum(1 for e in events if e["impact_level"] == "low"),
- "recent_90d": sum(1 for e in events if e["published_at"] >= cutoff),
+ "recent_90d": sum(1 for e in events if (e.get("published_at") or "") >= cutoff),
}
+
+    def upsert(self, event: dict) -> None:
+        """Insert ``event`` or merge it into the stored event with the same id.
+
+        Args:
+            event: Event fields; must contain ``"id"``.
+
+        Raises:
+            KeyError: If ``event`` has no ``"id"`` key.
+        """
+        event_id = event["id"]
+        existing = self._index.get(event_id)
+        if existing is not None:
+            # Merge in place so ``_events`` and ``_index`` keep sharing one dict.
+            existing.update(event)
+            return
+        # Store a copy: later mutation of the caller's dict must not change
+        # what the store holds (``__init__`` copies its seed data the same way).
+        stored = dict(event)
+        self._events.append(stored)
+        self._index[event_id] = stored
+
+    def get_by_standard_code(self, standard_code: str) -> dict | None:
+        """Return the most recently published event with ``standard_code``.
+
+        Args:
+            standard_code: Value compared against each event's ``standard_code``.
+
+        Returns:
+            The matching event with the greatest ``published_at`` string, or
+            ``None`` when no event matches.
+        """
+        candidates = (e for e in self._events if e.get("standard_code") == standard_code)
+        # ``published_at`` may be missing or explicitly None; treat both as ""
+        # so the comparison never mixes None with str (same rule as ``filter``).
+        return max(candidates, key=lambda e: e.get("published_at") or "", default=None)
diff --git a/backend/app/infrastructure/perception/postgres_event_store.py b/backend/app/infrastructure/perception/postgres_event_store.py
new file mode 100644
index 0000000..4782ae0
--- /dev/null
+++ b/backend/app/infrastructure/perception/postgres_event_store.py
@@ -0,0 +1,225 @@
+"""PostgreSQL-backed regulatory event store."""
+
+from __future__ import annotations
+
+import json
+from contextlib import contextmanager
+from datetime import UTC, date, datetime, timedelta
+from typing import Any
+
+import psycopg2
+import psycopg2.extras
+from psycopg2.pool import ThreadedConnectionPool
+
+from app.config.settings import settings
+from app.infrastructure.perception.base_event_store import BaseEventStore
+
+# DDL run by PostgresEventStore._ensure_schema. Every statement uses
+# IF NOT EXISTS, so executing it again against an existing schema is harmless.
+# It creates the regulation_events table plus two indexes that pair `source`
+# and `impact_level` with `published_at DESC`.
+_CREATE_TABLE = """
+CREATE TABLE IF NOT EXISTS regulation_events (
+ id TEXT PRIMARY KEY,
+ source TEXT NOT NULL,
+ source_label TEXT,
+ standard_code TEXT NOT NULL,
+ title TEXT NOT NULL,
+ summary TEXT,
+ full_text_url TEXT,
+ status TEXT,
+ impact_level TEXT,
+ published_at DATE,
+ effective_at DATE,
+ category TEXT,
+ tags TEXT[],
+ obligations JSONB,
+ deadlines JSONB,
+ scope TEXT,
+ penalties TEXT,
+ content_hash TEXT,
+ previous_hash TEXT,
+ change_summary TEXT,
+ changed_sections JSONB,
+ affected_docs JSONB,
+ crawled_at TIMESTAMPTZ DEFAULT now(),
+ processed_at TIMESTAMPTZ,
+ raw_storage_key TEXT
+);
+CREATE INDEX IF NOT EXISTS reg_events_source_date
+ ON regulation_events (source, published_at DESC);
+CREATE INDEX IF NOT EXISTS reg_events_impact_date
+ ON regulation_events (impact_level, published_at DESC);
+"""
+
+# Column names of regulation_events, in table order. PostgresEventStore.upsert
+# writes only the event keys that appear in this tuple; other keys are ignored.
+_ALL_COLUMNS = (
+ "id", "source", "source_label", "standard_code", "title", "summary",
+ "full_text_url", "status", "impact_level", "published_at", "effective_at",
+ "category", "tags", "obligations", "deadlines", "scope", "penalties",
+ "content_hash", "previous_hash", "change_summary", "changed_sections",
+ "affected_docs", "crawled_at", "processed_at", "raw_storage_key",
+)
+
+
+def _row_to_dict(row: dict[str, Any]) -> dict:
+    """Copy a result row into a plain dict with JSON-friendly values.
+
+    JSON columns that arrive as strings are decoded, date columns become
+    ``YYYY-MM-DD`` strings and timestamp columns become ISO-8601 strings.
+    The input row is not modified.
+    """
+    out = dict(row)
+
+    json_fields = ("obligations", "deadlines", "changed_sections", "affected_docs")
+    for name in json_fields:
+        raw = out.get(name)
+        if isinstance(raw, str):
+            out[name] = json.loads(raw)
+
+    for name in ("published_at", "effective_at"):
+        value = out.get(name)
+        if isinstance(value, date):
+            # datetime is a date subclass: drop its time part first.
+            day = value.date() if isinstance(value, datetime) else value
+            out[name] = day.isoformat()
+
+    for name in ("crawled_at", "processed_at"):
+        stamp = out.get(name)
+        if isinstance(stamp, datetime):
+            out[name] = stamp.isoformat()
+
+    return out
+
+
+class PostgresEventStore(BaseEventStore):
+    """Regulatory event store backed by PostgreSQL.
+
+    Connections come from a ``ThreadedConnectionPool``; each public method
+    borrows one connection for the duration of the call and returns it
+    afterwards.
+    """
+
+    # Columns declared JSONB in the table; values are JSON-encoded on write.
+    _JSON_COLUMNS = ("obligations", "deadlines", "changed_sections", "affected_docs")
+
+    # Keys returned by ``stats()``; kept in step with ``MockEventStore.stats()``.
+    _STAT_KEYS = ("total", "high_impact", "medium_impact", "low_impact", "recent_90d")
+
+    def __init__(self) -> None:
+        """Open the connection pool and make sure the schema exists.
+
+        Raises:
+            Exception: Whatever the driver raises while connecting or while
+                creating the schema. The pool is closed before re-raising.
+        """
+        self._pool = ThreadedConnectionPool(
+            minconn=1,
+            maxconn=5,
+            host=settings.postgres_host,
+            port=settings.postgres_port,
+            user=settings.postgres_user,
+            password=settings.postgres_password,
+            dbname=settings.postgres_db,
+        )
+        try:
+            self._ensure_schema()
+        except Exception:
+            # The store is unusable; do not leave pooled connections open.
+            self._pool.closeall()
+            raise
+
+    def _ensure_schema(self) -> None:
+        """Run the idempotent DDL in ``_CREATE_TABLE`` and commit it."""
+        with self._conn() as conn:
+            try:
+                with conn.cursor() as cur:
+                    cur.execute(_CREATE_TABLE)
+                conn.commit()
+            except Exception:
+                conn.rollback()
+                raise
+
+    @contextmanager
+    def _conn(self):
+        """Yield a pooled connection and hand it back even if the body raises."""
+        conn = None
+        try:
+            conn = self._pool.getconn()
+            yield conn
+        finally:
+            if conn is not None:
+                self._pool.putconn(conn)
+
+    def all(self) -> list[dict]:
+        """Return every event, newest ``published_at`` first, undated rows last."""
+        with self._conn() as conn:
+            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+                cur.execute(
+                    "SELECT * FROM regulation_events ORDER BY published_at DESC NULLS LAST"
+                )
+                return [_row_to_dict(r) for r in cur.fetchall()]
+
+    def get(self, event_id: str) -> dict | None:
+        """Return the event with primary key ``event_id``, or ``None``."""
+        with self._conn() as conn:
+            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+                cur.execute(
+                    "SELECT * FROM regulation_events WHERE id = %s", (event_id,)
+                )
+                row = cur.fetchone()
+                return _row_to_dict(row) if row else None
+
+    def filter(
+        self,
+        *,
+        source: str | None = None,
+        impact_level: str | None = None,
+        limit: int = 50,
+    ) -> list[dict]:
+        """Return up to ``limit`` events, newest ``published_at`` first.
+
+        Args:
+            source: When truthy, keep only rows with this ``source``.
+            impact_level: When truthy, keep only rows with this ``impact_level``.
+            limit: Maximum number of rows returned.
+        """
+        conditions: list[str] = []
+        params: list[Any] = []
+        if source:
+            conditions.append("source = %s")
+            params.append(source)
+        if impact_level:
+            conditions.append("impact_level = %s")
+            params.append(impact_level)
+        # Only fixed condition fragments are interpolated; values stay parameters.
+        where = ("WHERE " + " AND ".join(conditions)) if conditions else ""
+        params.append(limit)
+        sql = f"""
+            SELECT * FROM regulation_events
+            {where}
+            ORDER BY published_at DESC NULLS LAST
+            LIMIT %s
+        """
+        with self._conn() as conn:
+            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+                cur.execute(sql, params)
+                return [_row_to_dict(r) for r in cur.fetchall()]
+
+    def stats(self) -> dict:
+        """Return event counts: total, per impact level, and last 90 days.
+
+        All counters come from one statement, so they describe the same
+        snapshot of the table. The keys match ``MockEventStore.stats()``.
+        """
+        cutoff = (date.today() - timedelta(days=90)).isoformat()
+        sql = """
+            SELECT
+                COUNT(*) AS total,
+                COUNT(*) FILTER (WHERE impact_level = 'high') AS high_impact,
+                COUNT(*) FILTER (WHERE impact_level = 'medium') AS medium_impact,
+                COUNT(*) FILTER (WHERE impact_level = 'low') AS low_impact,
+                COUNT(*) FILTER (WHERE published_at >= %s) AS recent_90d
+            FROM regulation_events
+        """
+        with self._conn() as conn:
+            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+                cur.execute(sql, (cutoff,))
+                row = cur.fetchone() or {}
+        return {key: int(row.get(key) or 0) for key in self._STAT_KEYS}
+
+    def upsert(self, event: dict) -> None:
+        """Insert a regulation event, or update the row with the same ``id``.
+
+        Only keys listed in ``_ALL_COLUMNS`` are written; on conflict exactly
+        the supplied columns are overwritten.
+
+        Args:
+            event: Event fields keyed by column name.
+
+        Raises:
+            Exception: Whatever the driver raises; the transaction is rolled
+                back before re-raising.
+        """
+        cols = [c for c in _ALL_COLUMNS if c in event]
+        placeholders = ", ".join(f"%({c})s" for c in cols)
+        assignments = ", ".join(f"{c} = EXCLUDED.{c}" for c in cols if c != "id")
+        # With nothing but ``id`` supplied there is nothing to update, and an
+        # empty ``SET`` list would be invalid SQL.
+        conflict_action = f"DO UPDATE SET {assignments}" if assignments else "DO NOTHING"
+        sql = f"""
+            INSERT INTO regulation_events ({', '.join(cols)})
+            VALUES ({placeholders})
+            ON CONFLICT (id) {conflict_action}
+        """
+        row: dict[str, Any] = {}
+        for c in cols:
+            val = event[c]
+            if c in self._JSON_COLUMNS and val is not None:
+                row[c] = json.dumps(val, ensure_ascii=False)
+            else:
+                row[c] = val
+        with self._conn() as conn:
+            try:
+                with conn.cursor() as cur:
+                    cur.execute(sql, row)
+                conn.commit()
+            except Exception:
+                conn.rollback()
+                raise
+
+    def get_by_standard_code(self, standard_code: str) -> dict | None:
+        """Return the newest-published event with ``standard_code``, or ``None``."""
+        with self._conn() as conn:
+            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+                cur.execute(
+                    """SELECT * FROM regulation_events
+                       WHERE standard_code = %s
+                       ORDER BY published_at DESC NULLS LAST
+                       LIMIT 1""",
+                    (standard_code,),
+                )
+                row = cur.fetchone()
+                return _row_to_dict(row) if row else None
diff --git a/backend/app/shared/bootstrap.py b/backend/app/shared/bootstrap.py
index 7821924..1f2d981 100644
--- a/backend/app/shared/bootstrap.py
+++ b/backend/app/shared/bootstrap.py
@@ -19,6 +19,15 @@ from app.infrastructure.parser.local_chunk_builder import LocalRegulationChunkBu
from app.infrastructure.parser.local_document_parser import LocalDocumentParser
from app.infrastructure.parser.vector_chunk_builder import AliyunVectorChunkBuilder
from app.infrastructure.perception.mock_event_store import MockEventStore
+from app.application.perception.crawl_service import CrawlService
+from app.infrastructure.perception.base_event_store import BaseEventStore
+from app.infrastructure.perception.crawlers.catarc_crawler import CatarcCrawler
+from app.infrastructure.perception.crawlers.guobiao_crawler import (
+ GuobiaoMandatoryCrawler,
+ GuobiaoRecommendedCrawler,
+)
+from app.infrastructure.perception.crawlers.eurlex_crawler import EurlexCrawler
+from app.infrastructure.perception.llm_pipeline import LlmPipeline
from app.infrastructure.session.in_memory_conversation_store import InMemoryConversationStore
from app.infrastructure.storage.json_document_processing_store import JsonDocumentProcessingStore
from app.infrastructure.storage.json_document_repository import JsonDocumentRepository
@@ -293,11 +302,35 @@ def get_agent_conversation_service() -> AgentConversationService:
)
+@lru_cache
+def get_event_store() -> BaseEventStore:
+    """Build the event store matching ``settings.document_repository_backend``.
+
+    ``"postgres"`` yields a ``PostgresEventStore`` (imported lazily here);
+    any other value falls back to ``MockEventStore``.
+    """
+    if settings.document_repository_backend != "postgres":
+        return MockEventStore()
+
+    from app.infrastructure.perception.postgres_event_store import PostgresEventStore
+
+    return PostgresEventStore()
+
+
@lru_cache
def get_perception_service() -> PerceptionService:
- """Return perception service for regulatory intelligence."""
return PerceptionService(
- event_store=MockEventStore(),
+ event_store=get_event_store(),
+ retrieval_service=get_retrieval_service(),
+ )
+
+
+@lru_cache
+def get_crawl_service() -> CrawlService:
+ """Return a CrawlService wired with the four source crawlers, the shared
+ event store from get_event_store(), a new LlmPipeline and the retrieval service.
+ """
+ crawlers = {
+ "CATARC": CatarcCrawler(),
+ "国标委·强制性": GuobiaoMandatoryCrawler(),
+ "国标委·推荐性": GuobiaoRecommendedCrawler(),
+ "EUR-Lex": EurlexCrawler(),
+ }
+ return CrawlService(
+ crawlers=crawlers,
+ event_store=get_event_store(),
+ llm_pipeline=LlmPipeline(),
retrieval_service=get_retrieval_service(),
)
diff --git a/backend/requirements.txt b/backend/requirements.txt
index b75a8f0..5150ad0 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -9,6 +9,8 @@ pydantic-settings>=2.0.0
python-dotenv>=1.0.0
loguru>=0.7.0
httpx>=0.25.0
+beautifulsoup4>=4.12.0
+lxml>=5.0.0
tiktoken>=0.5.0
tenacity>=8.2.0
diff --git a/backend/tests/perception/__init__.py b/backend/tests/perception/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/backend/tests/perception/test_base_event_store.py b/backend/tests/perception/test_base_event_store.py
new file mode 100644
index 0000000..ebc4e1d
--- /dev/null
+++ b/backend/tests/perception/test_base_event_store.py
@@ -0,0 +1,95 @@
+"""Contract tests: any BaseEventStore implementation must pass these."""
+from app.infrastructure.perception.base_event_store import BaseEventStore
+from app.infrastructure.perception.mock_event_store import MockEventStore
+
+
+def _store() -> BaseEventStore:
+    """Create a fresh store instance for a single contract test."""
+    store: BaseEventStore = MockEventStore()
+    return store
+
+
+def test_is_base_event_store():
+    """The store under test implements the ``BaseEventStore`` interface."""
+    store = _store()
+    assert isinstance(store, BaseEventStore)
+
+
+def test_all_returns_list():
+    """``all()`` yields a non-empty list."""
+    events = _store().all()
+    assert isinstance(events, list)
+    assert len(events) >= 1
+
+
+def test_get_known_id():
+    """``get()`` finds an event by an id taken from ``all()``."""
+    store = _store()
+    known_id = store.all()[0]["id"]
+    fetched = store.get(known_id)
+    assert fetched is not None
+    assert fetched["id"] == known_id
+
+
+def test_get_unknown_returns_none():
+    """``get()`` answers ``None`` for an id that is not stored."""
+    missing = _store().get("does-not-exist")
+    assert missing is None
+
+
+def test_filter_by_impact():
+    """``filter(impact_level=...)`` returns only events of that level."""
+    highs = _store().filter(impact_level="high", limit=100)
+    for event in highs:
+        assert event["impact_level"] == "high"
+
+
+def test_filter_limit():
+    """``filter(limit=n)`` never returns more than ``n`` events."""
+    limited = _store().filter(limit=3)
+    assert len(limited) <= 3
+
+
+def test_stats_keys():
+    """``stats()`` reports the total, one counter per impact level, and the 90-day count.
+
+    ``low_impact`` is part of the contract because the reference
+    ``MockEventStore.stats()`` returns it.
+    """
+    stats = _store().stats()
+    expected = ("total", "high_impact", "medium_impact", "low_impact", "recent_90d")
+    missing = [key for key in expected if key not in stats]
+    assert not missing, f"missing keys: {missing}"
+
+
+def test_upsert_and_get():
+    """An event written with ``upsert()`` can be read back by its id."""
+    new_event = dict(
+        id="test-upsert-001",
+        source="TEST",
+        source_label="Test Source",
+        standard_code="TST-001",
+        title="Test Event",
+        summary="A test event",
+        full_text_url="https://example.com",
+        status="draft",
+        impact_level="low",
+        published_at="2026-01-01",
+        effective_at=None,
+        category="test",
+        tags=["test"],
+        content_hash="abc123",
+        previous_hash=None,
+    )
+    store = _store()
+    store.upsert(new_event)
+    fetched = store.get("test-upsert-001")
+    assert fetched is not None
+    assert fetched["title"] == "Test Event"
+
+
+def test_get_by_standard_code():
+    """``get_by_standard_code()`` finds an event by a code taken from ``all()``."""
+    store = _store()
+    code = store.all()[0]["standard_code"]
+    found = store.get_by_standard_code(code)
+    assert found is not None
+    assert found["standard_code"] == code
+
+
+def test_upsert_updates_existing():
+    """``upsert()`` with an existing id overwrites that event's fields."""
+    store = _store()
+    current = store.all()[0]
+    target_id = current["id"]
+    replacement = {
+        "id": target_id,
+        "title": "Updated Title",
+        "impact_level": current["impact_level"],
+        "standard_code": current.get("standard_code", ""),
+        "source": current["source"],
+        "source_label": current.get("source_label", ""),
+        "summary": "Updated",
+        "full_text_url": "",
+        "status": current["status"],
+        "published_at": current.get("published_at", ""),
+        "effective_at": None,
+        "category": current.get("category", ""),
+        "tags": [],
+        "content_hash": "newhash",
+        "previous_hash": None,
+    }
+    store.upsert(replacement)
+    refreshed = store.get(target_id)
+    assert refreshed is not None
+    assert refreshed["title"] == "Updated Title"
diff --git a/backend/tests/perception/test_crawl_service.py b/backend/tests/perception/test_crawl_service.py
new file mode 100644
index 0000000..50dec69
--- /dev/null
+++ b/backend/tests/perception/test_crawl_service.py
@@ -0,0 +1,111 @@
+"""Integration tests for CrawlService."""
+from __future__ import annotations
+from unittest.mock import MagicMock
+import hashlib
+import pytest
+
+from app.infrastructure.perception.crawlers.base import RawEvent
+from app.infrastructure.perception.mock_event_store import MockEventStore
+
+
+def _make_raw_event(code="TST-001"):
+    """Build a ``RawEvent`` for source ``TEST`` whose code and title use ``code``."""
+    fields = {
+        "source": "TEST",
+        "source_label": "Test",
+        "standard_code": code,
+        "title": f"Test {code}",
+        "summary": "Summary",
+        "full_text_url": "https://example.com",
+        "status": "enacted",
+        "published_at": "2026-01-01",
+        "effective_at": None,
+        "category": "test",
+        "tags": ["test"],
+        "raw_text": "full text",
+    }
+    return RawEvent(**fields)
+
+
+def _make_service(raw_events):
+    """Build a ``CrawlService`` whose single ``TEST`` crawler returns ``raw_events``.
+
+    The LLM pipeline and retrieval service are mocks; the event store is a
+    fresh ``MockEventStore``.
+    """
+    from app.application.perception.crawl_service import CrawlService
+
+    crawler = MagicMock()
+    crawler.fetch.return_value = raw_events
+
+    structure = {
+        "obligations": [],
+        "deadlines": [],
+        "scope": "test",
+        "penalties": None,
+        "impact_level": "low",
+    }
+    diff = {"changed_sections": [], "change_summary": "No changes."}
+    pipeline = MagicMock()
+    pipeline.extract_structure.return_value = structure
+    pipeline.assess_impact.return_value = []
+    pipeline.compute_diff.return_value = diff
+
+    return CrawlService(
+        crawlers={"TEST": crawler},
+        event_store=MockEventStore(),
+        llm_pipeline=pipeline,
+        retrieval_service=MagicMock(),
+    )
+
+
+def test_crawl_yields_progress_and_done():
+    """A crawl run emits an item whose ``event`` field is ``"done"``."""
+    service = _make_service([_make_raw_event("TST-001")])
+    emitted = list(service.run_crawl())
+    kinds = [item.get("event") for item in emitted]
+    assert "done" in kinds
+
+
+def test_crawl_upserts_to_store():
+    """A crawled event ends up in the event store under its standard code."""
+    from app.application.perception.crawl_service import CrawlService
+
+    store = MockEventStore()
+
+    crawler = MagicMock()
+    crawler.fetch.return_value = [_make_raw_event("NEW-001")]
+
+    pipeline = MagicMock()
+    pipeline.extract_structure.return_value = {
+        "obligations": [],
+        "deadlines": [],
+        "scope": "",
+        "penalties": None,
+        "impact_level": "medium",
+    }
+    pipeline.assess_impact.return_value = []
+    pipeline.compute_diff.return_value = {"changed_sections": [], "change_summary": ""}
+
+    service = CrawlService(
+        crawlers={"TEST": crawler},
+        event_store=store,
+        llm_pipeline=pipeline,
+        retrieval_service=MagicMock(),
+    )
+    # run_crawl() is consumed as an iterable; exhaust it so the crawl completes.
+    list(service.run_crawl())
+
+    stored = store.get_by_standard_code("NEW-001")
+    assert stored is not None
+    assert stored["title"] == "Test NEW-001"
+
+
+def test_crawl_skips_unchanged_events():
+    """An event whose stored ``content_hash`` matches the crawl is not re-processed."""
+    from app.application.perception.crawl_service import CrawlService
+
+    raw = _make_raw_event("SKIP-001")
+    unchanged_hash = hashlib.sha256(raw.raw_text.encode()).hexdigest()
+    stored_id = hashlib.sha256("TEST-SKIP-001".encode()).hexdigest()[:12]
+
+    # Pre-populate the store with the same content hash the crawler will produce.
+    store = MockEventStore()
+    store.upsert({
+        "id": stored_id,
+        "standard_code": "SKIP-001",
+        "source": "TEST",
+        "source_label": "Test",
+        "title": "Test SKIP-001",
+        "summary": "",
+        "full_text_url": "",
+        "status": "enacted",
+        "impact_level": "low",
+        "published_at": "2026-01-01",
+        "effective_at": None,
+        "category": "test",
+        "tags": [],
+        "content_hash": unchanged_hash,
+    })
+
+    pipeline = MagicMock()
+    crawler = MagicMock()
+    crawler.fetch.return_value = [raw]
+    service = CrawlService(
+        crawlers={"TEST": crawler},
+        event_store=store,
+        llm_pipeline=pipeline,
+        retrieval_service=MagicMock(),
+    )
+    list(service.run_crawl())
+
+    pipeline.extract_structure.assert_not_called()
diff --git a/backend/tests/perception/test_crawlers.py b/backend/tests/perception/test_crawlers.py
new file mode 100644
index 0000000..2aa43b6
--- /dev/null
+++ b/backend/tests/perception/test_crawlers.py
@@ -0,0 +1,127 @@
+"""Unit tests for crawlers — mock httpx responses."""
+from __future__ import annotations
+from unittest.mock import MagicMock, patch
+import pytest
+
+from app.infrastructure.perception.crawlers.base import RawEvent, BaseCrawler
+
+
+def test_raw_event_fields():
+    """``RawEvent`` keeps the values it was constructed with."""
+    fields = {
+        "source": "TEST",
+        "source_label": "Test",
+        "standard_code": "TST-001",
+        "title": "Test",
+        "summary": "Summary",
+        "full_text_url": "https://example.com",
+        "status": "enacted",
+        "published_at": "2026-01-01",
+        "effective_at": None,
+        "category": "test",
+        "tags": ["a"],
+        "raw_text": "full text here",
+    }
+    event = RawEvent(**fields)
+    assert event.source == "TEST"
+    assert event.tags == ["a"]
+
+
+CATARC_HTML = """
+
+
+
+"""
+
+
+def test_catarc_crawler_parses_html():
+    """``CatarcCrawler.fetch`` turns the canned HTML page into ``RawEvent`` objects."""
+    from app.infrastructure.perception.crawlers.catarc_crawler import CatarcCrawler
+
+    response = MagicMock()
+    response.status_code = 200
+    response.text = CATARC_HTML
+    response.raise_for_status = MagicMock()
+
+    with patch("httpx.get", return_value=response):
+        events = CatarcCrawler().fetch(limit=10)
+
+    assert isinstance(events, list)
+    assert len(events) >= 1
+    for event in events:
+        assert isinstance(event, RawEvent)
+    assert "GB 18384-2025" in [event.standard_code for event in events]
+
+
+GUOBIAO_JSON = {
+ "rows": [
+ {
+ "std_code": "GB 18384-2025",
+ "std_name": "电动汽车安全要求",
+ "release_date": "2025-11-15",
+ "implement_date": "2026-07-01",
+ "std_status": "现行",
+ "std_type": "强制性",
+ },
+ ]
+}
+
+
+def test_guobiao_crawler_parses_json():
+    """``GuobiaoMandatoryCrawler.fetch`` maps the canned JSON rows to events."""
+    from app.infrastructure.perception.crawlers.guobiao_crawler import GuobiaoMandatoryCrawler
+
+    response = MagicMock()
+    response.status_code = 200
+    response.json.return_value = GUOBIAO_JSON
+    response.raise_for_status = MagicMock()
+
+    with patch("httpx.get", return_value=response):
+        events = GuobiaoMandatoryCrawler().fetch(limit=10)
+
+    assert len(events) >= 1
+    first = events[0]
+    assert first.source == "国标委"
+    assert first.standard_code == "GB 18384-2025"
+
+
+EURLEX_RSS = """
+
+
+ EUR-Lex
+ -
+ Regulation (EU) 2024/1689 — AI Act
+ https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:32024R1689
+ The EU Artificial Intelligence Act enters into force.
+ Fri, 12 Jul 2024 00:00:00 GMT
+
+
+"""
+
+
+def test_eurlex_crawler_parses_rss():
+    """``EurlexCrawler.fetch`` turns the canned RSS feed into events."""
+    from app.infrastructure.perception.crawlers.eurlex_crawler import EurlexCrawler
+
+    response = MagicMock()
+    response.status_code = 200
+    # The same feed text is exposed through both ``text`` and ``content``.
+    response.text = EURLEX_RSS
+    response.content = EURLEX_RSS
+    response.raise_for_status = MagicMock()
+
+    with patch("httpx.get", return_value=response):
+        events = EurlexCrawler().fetch(limit=5)
+
+    assert isinstance(events, list)
+    assert len(events) >= 1
+    assert events[0].source == "EUR-Lex"
diff --git a/backend/tests/perception/test_llm_pipeline.py b/backend/tests/perception/test_llm_pipeline.py
new file mode 100644
index 0000000..f828f01
--- /dev/null
+++ b/backend/tests/perception/test_llm_pipeline.py
@@ -0,0 +1,77 @@
+"""Unit tests for LlmPipeline — mock LLM client and embedding provider."""
+from __future__ import annotations
+from unittest.mock import MagicMock, patch
+import json
+import pytest
+
+
+def _make_pipeline():
+    """Return ``(pipeline, llm_client_mock, embedding_mock)``.
+
+    The ``LlmPipeline`` is constructed while ``get_llm_client`` and
+    ``OpenAICompatibleEmbeddingProvider`` are patched in the pipeline module.
+    """
+    canned_reply = '{"obligations":[{"text":"test obligation","deontic":"must","subject":"OEM","object":"system","condition":""}],"deadlines":[{"date":"2026-07-01","description":"实施截止"}],"scope":"适用于M1类车辆","penalties":"罚款","impact_level":"high"}'
+
+    llm_client = MagicMock()
+    llm_client.chat.return_value = MagicMock(content=canned_reply)
+
+    embedder = MagicMock()
+    embedder.embed_texts.return_value = [[0.1] * 1024, [0.9] * 1024]
+
+    with patch("app.infrastructure.perception.llm_pipeline.get_llm_client") as llm_factory:
+        with patch(
+            "app.infrastructure.perception.llm_pipeline.OpenAICompatibleEmbeddingProvider"
+        ) as embedder_cls:
+            llm_factory.return_value = llm_client
+            embedder_cls.return_value = embedder
+
+            from app.infrastructure.perception.llm_pipeline import LlmPipeline
+
+            return LlmPipeline(), llm_client, embedder
+
+
+def test_extract_structure_returns_dict():
+    """``extract_structure`` returns a dict with obligations and an impact level."""
+    pipeline, _client, _embedder = _make_pipeline()
+    event = dict(
+        id="evt-001",
+        standard_code="GB 18384-2025",
+        title="电动汽车安全要求",
+        summary="新增 IP67 级别防护",
+        source_label="CATARC",
+        tags=["电池安全"],
+    )
+    structure = pipeline.extract_structure(event)
+    assert isinstance(structure, dict)
+    for key in ("obligations", "impact_level"):
+        assert key in structure
+
+
+def test_assess_impact_returns_list():
+    """``assess_impact`` returns a list when retrieval yields one chunk."""
+    pipeline, llm_client, _embedder = _make_pipeline()
+    llm_client.chat.return_value = MagicMock(content='[{"doc_id":"d1","doc_name":"Safety Manual","score":0.85,"key_clauses":"§4.2","recommendation":"更新第4章"}]')
+
+    chunk = MagicMock()
+    for attr, value in (
+        ("doc_id", "d1"),
+        ("doc_title", "Safety Manual"),
+        ("score", 0.85),
+        ("text", "relevant text"),
+        ("section_title", "§4.2"),
+    ):
+        setattr(chunk, attr, value)
+
+    retrieval = MagicMock()
+    retrieval.retrieve.return_value = [chunk]
+
+    event = {
+        "standard_code": "GB 18384-2025",
+        "title": "电动汽车安全要求",
+        "obligations": [{"text": "OEM shall comply"}],
+    }
+    impact = pipeline.assess_impact(event, retrieval)
+    assert isinstance(impact, list)
+
+
+def test_compute_diff_no_change():
+    """``compute_diff`` on identical text returns a dict with both result keys."""
+    pipeline, _client, embedder = _make_pipeline()
+    identical = [0.5] * 1024
+    embedder.embed_texts.return_value = [list(identical), list(identical)]
+    diff = pipeline.compute_diff("paragraph one", "paragraph one")
+    assert isinstance(diff, dict)
+    for key in ("changed_sections", "change_summary"):
+        assert key in diff
+
+
+def test_compute_diff_detects_change():
+    """``compute_diff`` returns a list of changed sections for dissimilar embeddings."""
+    pipeline, llm_client, embedder = _make_pipeline()
+    # Two unit vectors along different axes.
+    old_vec = [1.0] + [0.0] * 1023
+    new_vec = [0.0] + [1.0] + [0.0] * 1022
+    embedder.embed_texts.return_value = [old_vec, new_vec]
+    llm_client.chat.return_value = MagicMock(content='{"change_type":"tightened","summary":"Requirement tightened"}')
+    diff = pipeline.compute_diff("old paragraph text", "new tighter requirement text")
+    assert isinstance(diff["changed_sections"], list)
diff --git a/backend/tests/perception/test_postgres_event_store.py b/backend/tests/perception/test_postgres_event_store.py
new file mode 100644
index 0000000..95957bf
--- /dev/null
+++ b/backend/tests/perception/test_postgres_event_store.py
@@ -0,0 +1,98 @@
+"""Unit tests for PostgresEventStore using a mocked psycopg2 pool."""
+from __future__ import annotations
+import json
+from unittest.mock import MagicMock, patch
+import pytest
+
+# Patch psycopg2 before importing the module under test
+# NOTE: setdefault only registers these mocks when sys.modules has no entry
+# for the name yet; an already-imported real psycopg2 is left in place.
+import sys
+mock_psycopg2 = MagicMock()
+mock_psycopg2.extras = MagicMock()
+sys.modules.setdefault("psycopg2", mock_psycopg2)
+sys.modules.setdefault("psycopg2.extras", mock_psycopg2.extras)
+sys.modules.setdefault("psycopg2.pool", MagicMock())
+
+from app.infrastructure.perception.base_event_store import BaseEventStore
+
+
+SAMPLE_ROW = {
+ "id": "pg-001",
+ "source": "国标委",
+ "source_label": "国家标准化管理委员会",
+ "standard_code": "GB 18384-2025",
+ "title": "电动汽车安全要求",
+ "summary": "新增要求",
+ "full_text_url": "https://openstd.samr.gov.cn",
+ "status": "enacted",
+ "impact_level": "high",
+ "published_at": "2025-11-15",
+ "effective_at": "2026-07-01",
+ "category": "电动汽车安全",
+ "tags": ["电池安全"],
+ "obligations": None,
+ "deadlines": None,
+ "scope": None,
+ "penalties": None,
+ "content_hash": "abc123",
+ "previous_hash": None,
+ "change_summary": None,
+ "changed_sections": None,
+ "affected_docs": None,
+ "crawled_at": "2026-06-05T10:00:00+00:00",
+ "processed_at": None,
+ "raw_storage_key": None,
+}
+
+
+def _make_store_with_pool(mock_pool):
+    """Build a ``PostgresEventStore`` whose connection pool is ``mock_pool``.
+
+    Schema creation is patched out so no SQL runs during construction.
+    """
+    from app.infrastructure.perception import postgres_event_store as store_module
+
+    # Patch the name the store module actually calls. It binds
+    # ``ThreadedConnectionPool`` with ``from psycopg2.pool import ...`` at import
+    # time, so patching ``psycopg2.pool`` only works for the very first import;
+    # afterwards every store would keep using the first test's pool.
+    with patch.object(store_module, "ThreadedConnectionPool", return_value=mock_pool):
+        with patch.object(store_module.PostgresEventStore, "_ensure_schema"):
+            return store_module.PostgresEventStore()
+
+
+def _cursor_returning(rows):
+    """Return a mock cursor usable in ``with`` that serves ``rows``.
+
+    ``fetchall()`` gives ``rows``; ``fetchone()`` gives the first row, or
+    ``None`` when ``rows`` is empty.
+    """
+    first_row = rows[0] if rows else None
+    cursor = MagicMock()
+    cursor.__enter__.return_value = cursor
+    cursor.__exit__ = MagicMock(return_value=False)
+    cursor.fetchone.return_value = first_row
+    cursor.fetchall.return_value = rows
+    return cursor
+
+
+def test_is_base_event_store():
+    """``PostgresEventStore`` implements the ``BaseEventStore`` interface."""
+    store = _make_store_with_pool(MagicMock())
+    assert isinstance(store, BaseEventStore)
+
+
+def test_filter_returns_list():
+    """``filter()`` returns a list when the cursor serves one sample row."""
+    conn = MagicMock()
+    conn.__enter__.return_value = conn
+    conn.__exit__ = MagicMock(return_value=False)
+    conn.cursor.return_value = _cursor_returning([SAMPLE_ROW])
+
+    pool = MagicMock()
+    pool.getconn.return_value = conn
+
+    rows = _make_store_with_pool(pool).filter(limit=10)
+    assert isinstance(rows, list)
+
+
+def test_stats_returns_correct_keys():
+    """``stats()`` exposes the total, high, medium and 90-day counters."""
+    cursor = MagicMock()
+    cursor.__enter__.return_value = cursor
+    cursor.__exit__ = MagicMock(return_value=False)
+    cursor.fetchone.return_value = {"count": 5}
+
+    conn = MagicMock()
+    conn.__enter__.return_value = conn
+    conn.__exit__ = MagicMock(return_value=False)
+    conn.cursor.return_value = cursor
+
+    pool = MagicMock()
+    pool.getconn.return_value = conn
+
+    stats = _make_store_with_pool(pool).stats()
+    expected = ("total", "high_impact", "medium_impact", "recent_90d")
+    for key in expected:
+        assert key in stats
diff --git a/docs/superpowers/plans/2026-06-05-perception-intelligence.md b/docs/superpowers/plans/2026-06-05-perception-intelligence.md
new file mode 100644
index 0000000..319404b
--- /dev/null
+++ b/docs/superpowers/plans/2026-06-05-perception-intelligence.md
@@ -0,0 +1,2500 @@
+# Regulatory Signals Intelligence Enhancement — Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Replace MockEventStore with real regulatory data from CATARC / 国标委 / EUR-Lex / UN-ECE, add LLM-driven structure extraction + impact assessment + semantic diff, and expose all of this through a manual-trigger crawl UI.
+
+**Architecture:** New `BaseEventStore` ABC → `PostgresEventStore` implementation (psycopg2, same pattern as `PostgresDocumentRepository`) → `CrawlService` orchestrates 4 crawlers + `LlmPipeline` → 3 new API endpoints (SSE crawl progress, single-event process, diff detail) → `bootstrap.py` selects store by `DOCUMENT_REPOSITORY_BACKEND` → frontend adds crawl bar + detail tabs.
+
+**Tech Stack:** httpx (already in requirements), BeautifulSoup4 + lxml (new), psycopg2-binary (already present), existing LLM factory (`app.services.llm.llm_factory`), existing `OpenAICompatibleEmbeddingProvider` for semantic diff, FastAPI SSE (existing pattern from `perception.py` + `async_utils.iter_in_thread`).
+
+---
+
+## File Map
+
+| Action | Path | Purpose |
+|--------|------|---------|
+| Create | `backend/app/infrastructure/perception/base_event_store.py` | ABC with `all/get/filter/stats/upsert/get_by_standard_code` |
+| Modify | `backend/app/infrastructure/perception/mock_event_store.py` | Inherit `BaseEventStore` |
+| Create | `backend/app/infrastructure/perception/postgres_event_store.py` | PostgreSQL-backed store |
+| Create | `backend/app/infrastructure/perception/crawlers/__init__.py` | Package init |
+| Create | `backend/app/infrastructure/perception/crawlers/base.py` | `RawEvent` dataclass + `BaseCrawler` ABC |
+| Create | `backend/app/infrastructure/perception/crawlers/catarc_crawler.py` | CATARC scraper |
+| Create | `backend/app/infrastructure/perception/crawlers/guobiao_crawler.py` | 国标委 JSON API crawler |
+| Create | `backend/app/infrastructure/perception/crawlers/eurlex_crawler.py` | EUR-Lex RSS + CELLAR |
+| Create | `backend/app/infrastructure/perception/llm_pipeline.py` | Extract / assess / diff |
+| Create | `backend/app/application/perception/crawl_service.py` | Orchestrates crawlers + pipeline |
+| Modify | `backend/app/application/perception/services.py` | Type hint: `BaseEventStore` instead of `MockEventStore` |
+| Modify | `backend/app/api/routes/perception.py` | Add 3 new endpoints |
+| Modify | `backend/app/shared/bootstrap.py` | Wire new classes; add `get_crawl_service()` |
+| Modify | `backend/app/config/settings.py` | 3 new perception settings |
+| Modify | `.env` + `.env.example` (repo root) | New env vars |
+| Modify | `backend/requirements.txt` | Add beautifulsoup4, lxml |
+| Modify | `frontend/src/pages/Perception/PerceptionPage.tsx` | Crawl bar + detail tabs |
+| Create | `backend/tests/perception/__init__.py` | Test package |
+| Create | `backend/tests/perception/test_base_event_store.py` | BaseEventStore contract tests |
+| Create | `backend/tests/perception/test_postgres_event_store.py` | PostgresEventStore unit tests (mock psycopg2) |
+| Create | `backend/tests/perception/test_crawlers.py` | Crawler unit tests (mock httpx) |
+| Create | `backend/tests/perception/test_llm_pipeline.py` | Pipeline unit tests (mock LLM + embed) |
+| Create | `backend/tests/perception/test_crawl_service.py` | CrawlService integration tests |
+
+---
+
+## Task 1: BaseEventStore ABC + MockEventStore implements it
+
+**Files:**
+- Create: `backend/app/infrastructure/perception/base_event_store.py`
+- Modify: `backend/app/infrastructure/perception/mock_event_store.py`
+- Create: `backend/tests/perception/__init__.py`
+- Create: `backend/tests/perception/test_base_event_store.py`
+
+- [ ] **Step 1: Write the failing test**
+
+```python
+# backend/tests/perception/__init__.py
+# (empty)
+```
+
+```python
+# backend/tests/perception/test_base_event_store.py
+"""Contract tests: any BaseEventStore implementation must pass these."""
+from app.infrastructure.perception.base_event_store import BaseEventStore
+from app.infrastructure.perception.mock_event_store import MockEventStore
+
+
+def _store() -> BaseEventStore:
+ return MockEventStore()
+
+
+def test_is_base_event_store():
+ assert isinstance(_store(), BaseEventStore)
+
+
+def test_all_returns_list():
+ result = _store().all()
+ assert isinstance(result, list)
+ assert len(result) > 0
+
+
+def test_get_known_id():
+ store = _store()
+ first = store.all()[0]
+ result = store.get(first["id"])
+ assert result is not None
+ assert result["id"] == first["id"]
+
+
+def test_get_unknown_returns_none():
+ assert _store().get("does-not-exist") is None
+
+
+def test_filter_by_impact():
+ store = _store()
+ highs = store.filter(impact_level="high", limit=100)
+ assert all(e["impact_level"] == "high" for e in highs)
+
+
+def test_filter_limit():
+ store = _store()
+ result = store.filter(limit=3)
+ assert len(result) <= 3
+
+
+def test_stats_keys():
+ stats = _store().stats()
+ for key in ("total", "high_impact", "medium_impact", "recent_90d"):
+ assert key in stats, f"missing key: {key}"
+
+
+def test_upsert_and_get():
+ store = _store()
+ event = {
+ "id": "test-upsert-001",
+ "source": "TEST",
+ "source_label": "Test Source",
+ "standard_code": "TST-001",
+ "title": "Test Event",
+ "summary": "A test event",
+ "full_text_url": "https://example.com",
+ "status": "draft",
+ "impact_level": "low",
+ "published_at": "2026-01-01",
+ "effective_at": None,
+ "category": "test",
+ "tags": ["test"],
+ "content_hash": "abc123",
+ "previous_hash": None,
+ }
+ store.upsert(event)
+ result = store.get("test-upsert-001")
+ assert result is not None
+ assert result["title"] == "Test Event"
+
+
+def test_get_by_standard_code():
+ store = _store()
+ first = store.all()[0]
+ result = store.get_by_standard_code(first["standard_code"])
+ assert result is not None
+ assert result["standard_code"] == first["standard_code"]
+```
+
+- [ ] **Step 2: Run test to verify it fails**
+
+```
+cd backend && PYTHONPATH=. pytest tests/perception/test_base_event_store.py -v
+```
+Expected: ImportError on `base_event_store`
+
+- [ ] **Step 3: Create BaseEventStore ABC**
+
+```python
+# backend/app/infrastructure/perception/base_event_store.py
+"""Abstract base class for regulatory event stores."""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+
+
+class BaseEventStore(ABC):
+ """Port interface for regulatory event persistence."""
+
+ @abstractmethod
+ def all(self) -> list[dict]:
+ """Return all events, most-recent first."""
+
+ @abstractmethod
+ def get(self, event_id: str) -> dict | None:
+ """Return a single event by ID, or None."""
+
+ @abstractmethod
+ def filter(
+ self,
+ *,
+ source: str | None = None,
+ impact_level: str | None = None,
+ limit: int = 50,
+ ) -> list[dict]:
+ """Return filtered events sorted by published_at descending."""
+
+ @abstractmethod
+ def stats(self) -> dict:
+ """Return {total, high_impact, medium_impact, recent_90d}."""
+
+ @abstractmethod
+ def upsert(self, event: dict) -> None:
+ """Insert or update an event record."""
+
+ @abstractmethod
+ def get_by_standard_code(self, standard_code: str) -> dict | None:
+ """Return the most-recent event with matching standard_code, or None."""
+```
+
+- [ ] **Step 4: Patch MockEventStore to inherit BaseEventStore and add new methods**
+
+Open `backend/app/infrastructure/perception/mock_event_store.py`.
+
+Add at the top (after existing imports):
+```python
+from app.infrastructure.perception.base_event_store import BaseEventStore
+```
+
+Change class definition from:
+```python
+class MockEventStore:
+```
+to:
+```python
+class MockEventStore(BaseEventStore):
+```
+
+Add these two methods at the end of `MockEventStore`, after `stats()`:
+```python
+ def upsert(self, event: dict) -> None:
+ """Insert or update event in the in-memory list (used in tests)."""
+ existing = _EVENT_INDEX.get(event["id"])
+ if existing:
+ existing.update(event)
+ else:
+ MOCK_EVENTS.append(event)
+ _EVENT_INDEX[event["id"]] = event
+
+ def get_by_standard_code(self, standard_code: str) -> dict | None:
+ """Return most-recent event with matching standard_code."""
+ matches = [e for e in MOCK_EVENTS if e.get("standard_code") == standard_code]
+ if not matches:
+ return None
+ return max(matches, key=lambda e: e.get("published_at", ""))
+```
+
+- [ ] **Step 5: Run tests — expect PASS**
+
+```
+cd backend && PYTHONPATH=. pytest tests/perception/test_base_event_store.py -v
+```
+Expected: 9 tests PASS
+
+---
+
+## Task 2: PostgresEventStore
+
+**Files:**
+- Create: `backend/app/infrastructure/perception/postgres_event_store.py`
+- Create: `backend/tests/perception/test_postgres_event_store.py`
+
+- [ ] **Step 1: Write the failing test (mock psycopg2)**
+
+```python
+# backend/tests/perception/test_postgres_event_store.py
+"""Unit tests for PostgresEventStore using a mocked psycopg2 pool."""
+from __future__ import annotations
+import json
+from unittest.mock import MagicMock, patch, call
+import pytest
+
+# Patch psycopg2 before importing the module under test
+import sys
+mock_psycopg2 = MagicMock()
+mock_psycopg2.extras = MagicMock()
+sys.modules.setdefault("psycopg2", mock_psycopg2)
+sys.modules.setdefault("psycopg2.extras", mock_psycopg2.extras)
+sys.modules.setdefault("psycopg2.pool", MagicMock())
+
+from app.infrastructure.perception.base_event_store import BaseEventStore
+
+
+SAMPLE_ROW = {
+ "id": "pg-001",
+ "source": "国标委",
+ "source_label": "国家标准化管理委员会",
+ "standard_code": "GB 18384-2025",
+ "title": "电动汽车安全要求",
+ "summary": "新增要求",
+ "full_text_url": "https://openstd.samr.gov.cn",
+ "status": "enacted",
+ "impact_level": "high",
+ "published_at": "2025-11-15",
+ "effective_at": "2026-07-01",
+ "category": "电动汽车安全",
+ "tags": ["电池安全"],
+ "obligations": None,
+ "deadlines": None,
+ "scope": None,
+ "penalties": None,
+ "content_hash": "abc123",
+ "previous_hash": None,
+ "change_summary": None,
+ "changed_sections": None,
+ "affected_docs": None,
+ "crawled_at": "2026-06-05T10:00:00+00:00",
+ "processed_at": None,
+ "raw_storage_key": None,
+}
+
+
+def _make_store_with_pool(mock_pool):
+    # Patch the name bound inside the module under test: it uses `from psycopg2.pool import ...`, so patching psycopg2.pool is not seen.
+    target = "app.infrastructure.perception.postgres_event_store"
+    with patch(f"{target}.ThreadedConnectionPool", return_value=mock_pool):
+        with patch(f"{target}.PostgresEventStore._ensure_schema"):
+            from app.infrastructure.perception.postgres_event_store import PostgresEventStore
+            return PostgresEventStore()
+
+
+def _cursor_returning(rows):
+ cursor = MagicMock()
+ cursor.__enter__ = lambda s: s
+ cursor.__exit__ = MagicMock(return_value=False)
+ cursor.fetchall.return_value = rows
+ cursor.fetchone.return_value = rows[0] if rows else None
+ return cursor
+
+
+def test_is_base_event_store():
+ mock_pool = MagicMock()
+ store = _make_store_with_pool(mock_pool)
+ assert isinstance(store, BaseEventStore)
+
+
+def test_filter_returns_list():
+ mock_pool = MagicMock()
+ conn = MagicMock()
+ conn.__enter__ = lambda s: s
+ conn.__exit__ = MagicMock(return_value=False)
+ cursor = _cursor_returning([SAMPLE_ROW])
+ conn.cursor.return_value = cursor
+ mock_pool.getconn.return_value = conn
+ store = _make_store_with_pool(mock_pool)
+ result = store.filter(limit=10)
+ assert isinstance(result, list)
+
+
+def test_stats_returns_correct_keys():
+ mock_pool = MagicMock()
+ conn = MagicMock()
+ conn.__enter__ = lambda s: s
+ conn.__exit__ = MagicMock(return_value=False)
+ # stats runs 4 queries
+ cursor = MagicMock()
+ cursor.__enter__ = lambda s: s
+ cursor.__exit__ = MagicMock(return_value=False)
+ cursor.fetchone.return_value = {"count": 5}
+ conn.cursor.return_value = cursor
+ mock_pool.getconn.return_value = conn
+ store = _make_store_with_pool(mock_pool)
+ stats = store.stats()
+ for key in ("total", "high_impact", "medium_impact", "recent_90d"):
+ assert key in stats
+```
+
+- [ ] **Step 2: Run test to verify it fails**
+
+```
+cd backend && PYTHONPATH=. pytest tests/perception/test_postgres_event_store.py -v
+```
+Expected: ImportError on `postgres_event_store`
+
+- [ ] **Step 3: Implement PostgresEventStore**
+
+```python
+# backend/app/infrastructure/perception/postgres_event_store.py
+"""PostgreSQL-backed regulatory event store."""
+
+from __future__ import annotations
+
+import json
+from contextlib import contextmanager
+from datetime import UTC, date, datetime, timedelta
+from typing import Any
+
+import psycopg2
+import psycopg2.extras
+from psycopg2.pool import ThreadedConnectionPool
+
+from app.config.settings import settings
+from app.infrastructure.perception.base_event_store import BaseEventStore
+
+_CREATE_TABLE = """
+CREATE TABLE IF NOT EXISTS regulation_events (
+ id TEXT PRIMARY KEY,
+ source TEXT NOT NULL,
+ source_label TEXT,
+ standard_code TEXT NOT NULL,
+ title TEXT NOT NULL,
+ summary TEXT,
+ full_text_url TEXT,
+ status TEXT,
+ impact_level TEXT,
+ published_at DATE,
+ effective_at DATE,
+ category TEXT,
+ tags TEXT[],
+ obligations JSONB,
+ deadlines JSONB,
+ scope TEXT,
+ penalties TEXT,
+ content_hash TEXT,
+ previous_hash TEXT,
+ change_summary TEXT,
+ changed_sections JSONB,
+ affected_docs JSONB,
+ crawled_at TIMESTAMPTZ DEFAULT now(),
+ processed_at TIMESTAMPTZ,
+ raw_storage_key TEXT
+);
+CREATE INDEX IF NOT EXISTS reg_events_source_date
+ ON regulation_events (source, published_at DESC);
+CREATE INDEX IF NOT EXISTS reg_events_impact_date
+ ON regulation_events (impact_level, published_at DESC);
+"""
+
+_ALL_COLUMNS = (
+ "id", "source", "source_label", "standard_code", "title", "summary",
+ "full_text_url", "status", "impact_level", "published_at", "effective_at",
+ "category", "tags", "obligations", "deadlines", "scope", "penalties",
+ "content_hash", "previous_hash", "change_summary", "changed_sections",
+ "affected_docs", "crawled_at", "processed_at", "raw_storage_key",
+)
+
+
+def _row_to_dict(row: dict[str, Any]) -> dict:
+ """Convert a psycopg2 RealDictRow to a plain dict with serialized JSON fields."""
+ d = dict(row)
+ for field in ("obligations", "deadlines", "changed_sections", "affected_docs"):
+ val = d.get(field)
+ if isinstance(val, str):
+ d[field] = json.loads(val)
+ for date_field in ("published_at", "effective_at"):
+ val = d.get(date_field)
+ if isinstance(val, date):
+ d[date_field] = val.isoformat()
+ for ts_field in ("crawled_at", "processed_at"):
+ val = d.get(ts_field)
+ if isinstance(val, datetime):
+ d[ts_field] = val.isoformat()
+ return d
+
+
+class PostgresEventStore(BaseEventStore):
+    """Regulatory event store backed by PostgreSQL."""
+
+    def __init__(self) -> None:
+        self._pool = ThreadedConnectionPool(
+            minconn=1,
+            maxconn=5,
+            host=settings.postgres_host,
+            port=settings.postgres_port,
+            user=settings.postgres_user,
+            password=settings.postgres_password,
+            dbname=settings.postgres_db,
+        )
+        self._ensure_schema()
+
+    def _ensure_schema(self) -> None:
+        """Create the events table and indexes if missing (committed by `_conn`)."""
+        with self._conn() as conn:
+            with conn.cursor() as cur:
+                cur.execute(_CREATE_TABLE)
+
+    @contextmanager
+    def _conn(self):
+        """Borrow a pooled connection; commit on success, roll back on error."""
+        conn = self._pool.getconn()
+        try:
+            # End the transaction before putconn() so no aborted or idle-in-transaction session goes back to the pool.
+            with conn:
+                yield conn
+        finally:
+            self._pool.putconn(conn)
+
+    def all(self) -> list[dict]:
+        with self._conn() as conn:
+            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+                cur.execute(
+                    "SELECT * FROM regulation_events ORDER BY published_at DESC NULLS LAST"
+                )
+                return [_row_to_dict(r) for r in cur.fetchall()]
+
+    def get(self, event_id: str) -> dict | None:
+        with self._conn() as conn:
+            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+                cur.execute(
+                    "SELECT * FROM regulation_events WHERE id = %s", (event_id,)
+                )
+                row = cur.fetchone()
+                return _row_to_dict(row) if row else None
+
+    def filter(
+        self,
+        *,
+        source: str | None = None,
+        impact_level: str | None = None,
+        limit: int = 50,
+    ) -> list[dict]:
+        conditions: list[str] = []
+        params: list[Any] = []
+        if source:
+            conditions.append("source = %s")
+            params.append(source)
+        if impact_level:
+            conditions.append("impact_level = %s")
+            params.append(impact_level)
+        where = ("WHERE " + " AND ".join(conditions)) if conditions else ""
+        params.append(limit)
+        sql = f"""
+            SELECT * FROM regulation_events
+            {where}
+            ORDER BY published_at DESC NULLS LAST
+            LIMIT %s
+        """
+        with self._conn() as conn:
+            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+                cur.execute(sql, params)
+                return [_row_to_dict(r) for r in cur.fetchall()]
+
+    def stats(self) -> dict:
+        cutoff = (date.today() - timedelta(days=90)).isoformat()
+        with self._conn() as conn:
+            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+                cur.execute("SELECT COUNT(*) AS count FROM regulation_events")
+                total = (cur.fetchone() or {}).get("count", 0)
+                cur.execute(
+                    "SELECT COUNT(*) AS count FROM regulation_events WHERE impact_level = 'high'"
+                )
+                high = (cur.fetchone() or {}).get("count", 0)
+                cur.execute(
+                    "SELECT COUNT(*) AS count FROM regulation_events WHERE impact_level = 'medium'"
+                )
+                medium = (cur.fetchone() or {}).get("count", 0)
+                cur.execute(
+                    "SELECT COUNT(*) AS count FROM regulation_events WHERE published_at >= %s",
+                    (cutoff,),
+                )
+                recent = (cur.fetchone() or {}).get("count", 0)
+        return {
+            "total": int(total),
+            "high_impact": int(high),
+            "medium_impact": int(medium),
+            "recent_90d": int(recent),
+        }
+
+    def upsert(self, event: dict) -> None:
+        """Insert or update a regulation event."""
+        cols = [c for c in _ALL_COLUMNS if c in event]
+        placeholders = ", ".join(f"%({c})s" for c in cols)
+        updates = ", ".join(f"{c} = EXCLUDED.{c}" for c in cols if c != "id")
+        sql = f"""
+            INSERT INTO regulation_events ({', '.join(cols)})
+            VALUES ({placeholders})
+            ON CONFLICT (id) DO UPDATE SET {updates}
+        """
+        row: dict[str, Any] = {}
+        for c in cols:
+            val = event.get(c)
+            if c in ("obligations", "deadlines", "changed_sections", "affected_docs") and val is not None:
+                row[c] = json.dumps(val, ensure_ascii=False)
+            else:
+                row[c] = val  # lists such as tags are adapted to SQL arrays by psycopg2
+        with self._conn() as conn:
+            with conn.cursor() as cur:
+                cur.execute(sql, row)
+
+    def get_by_standard_code(self, standard_code: str) -> dict | None:
+        with self._conn() as conn:
+            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+                cur.execute(
+                    """SELECT * FROM regulation_events
+                       WHERE standard_code = %s
+                       ORDER BY published_at DESC NULLS LAST
+                       LIMIT 1""",
+                    (standard_code,),
+                )
+                row = cur.fetchone()
+                return _row_to_dict(row) if row else None
+```
+
+- [ ] **Step 4: Run tests — expect PASS**
+
+```
+cd backend && PYTHONPATH=. pytest tests/perception/test_postgres_event_store.py -v
+```
+Expected: 3 tests PASS
+
+---
+
+## Task 3: Crawler base + CATARC crawler
+
+**Files:**
+- Create: `backend/app/infrastructure/perception/crawlers/__init__.py`
+- Create: `backend/app/infrastructure/perception/crawlers/base.py`
+- Create: `backend/app/infrastructure/perception/crawlers/catarc_crawler.py`
+- Create: `backend/tests/perception/test_crawlers.py`
+
+- [ ] **Step 1: Write failing test**
+
+```python
+# backend/tests/perception/test_crawlers.py
+"""Unit tests for crawlers — mock httpx responses."""
+from __future__ import annotations
+from unittest.mock import MagicMock, patch
+import pytest
+
+from app.infrastructure.perception.crawlers.base import RawEvent, BaseCrawler
+
+
+def test_raw_event_fields():
+ ev = RawEvent(
+ source="TEST",
+ source_label="Test",
+ standard_code="TST-001",
+ title="Test",
+ summary="Summary",
+ full_text_url="https://example.com",
+ status="enacted",
+ published_at="2026-01-01",
+ effective_at=None,
+ category="test",
+ tags=["a"],
+ raw_text="full text here",
+ )
+ assert ev.source == "TEST"
+ assert ev.tags == ["a"]
+
+
+CATARC_HTML = """
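+<table>
+  <tr><td><a href="/bzzxd/qcbz/gb-18384-2025.html">GB 18384-2025</a></td><td>电动汽车安全要求</td><td>2025-11-15</td><td>现行</td></tr>
+</table>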
+"""
+
+
+def test_catarc_crawler_parses_html():
+ from app.infrastructure.perception.crawlers.catarc_crawler import CatarcCrawler
+
+ mock_resp = MagicMock()
+ mock_resp.status_code = 200
+ mock_resp.text = CATARC_HTML
+ mock_resp.raise_for_status = MagicMock()
+
+ with patch("httpx.get", return_value=mock_resp):
+ crawler = CatarcCrawler()
+ events = crawler.fetch(limit=10)
+
+ assert isinstance(events, list)
+ assert len(events) >= 1
+ assert all(isinstance(e, RawEvent) for e in events)
+ codes = [e.standard_code for e in events]
+ assert "GB 18384-2025" in codes
+
+
+GUOBIAO_JSON = {
+ "rows": [
+ {
+ "std_code": "GB 18384-2025",
+ "std_name": "电动汽车安全要求",
+ "release_date": "2025-11-15",
+ "implement_date": "2026-07-01",
+ "std_status": "现行",
+ "std_type": "强制性",
+ },
+ ]
+}
+
+
+def test_guobiao_crawler_parses_json():
+ from app.infrastructure.perception.crawlers.guobiao_crawler import GuobiaoMandatoryCrawler
+
+ mock_resp = MagicMock()
+ mock_resp.status_code = 200
+ mock_resp.json.return_value = GUOBIAO_JSON
+ mock_resp.raise_for_status = MagicMock()
+
+ with patch("httpx.get", return_value=mock_resp):
+ crawler = GuobiaoMandatoryCrawler()
+ events = crawler.fetch(limit=10)
+
+ assert len(events) >= 1
+ assert events[0].source == "国标委"
+ assert events[0].standard_code == "GB 18384-2025"
+```
+
+- [ ] **Step 2: Run test to verify it fails**
+
+```
+cd backend && PYTHONPATH=. pytest tests/perception/test_crawlers.py -v
+```
+Expected: ImportError
+
+- [ ] **Step 3: Create crawler base**
+
+```python
+# backend/app/infrastructure/perception/crawlers/__init__.py
+```
+
+```python
+# backend/app/infrastructure/perception/crawlers/base.py
+"""Shared contracts for regulatory source crawlers."""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+
+
+@dataclass
+class RawEvent:
+ """Raw regulatory event returned by a crawler before enrichment."""
+
+ source: str
+ source_label: str
+ standard_code: str
+ title: str
+ summary: str
+ full_text_url: str
+ status: str # 'enacted' | 'draft' | 'consultation'
+ published_at: str # YYYY-MM-DD string
+ effective_at: str | None
+ category: str
+ tags: list[str] = field(default_factory=list)
+ raw_text: str = "" # full crawled text for hashing + LLM
+
+
+class BaseCrawler(ABC):
+ """Abstract regulatory source crawler."""
+
+ @abstractmethod
+ def fetch(self, limit: int = 50) -> list[RawEvent]:
+ """Fetch up to `limit` recent events from the data source."""
+```
+
+- [ ] **Step 4: Create CATARC crawler**
+
+```python
+# backend/app/infrastructure/perception/crawlers/catarc_crawler.py
+"""Crawler for CATARC automotive standard catalogue."""
+
+from __future__ import annotations
+
+import hashlib
+
+import httpx
+from bs4 import BeautifulSoup
+from loguru import logger
+
+from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
+
+_BASE_URL = "https://www.catarc.org.cn/bzzxd/qcbz/index.html"
+_HOST = "https://www.catarc.org.cn"
+
+# Status strings appearing on the CATARC site mapped to our vocabulary.
+_STATUS_MAP = {
+ "现行": "enacted",
+ "即将实施": "enacted",
+ "废止": "enacted",
+ "征求意见": "consultation",
+ "报批": "draft",
+}
+
+
+class CatarcCrawler(BaseCrawler):
+ """Scrape the CATARC automotive standard list page."""
+
+ def fetch(self, limit: int = 50) -> list[RawEvent]:
+ events: list[RawEvent] = []
+ page = 1
+ while len(events) < limit:
+ url = f"{_BASE_URL}?page={page}"
+ try:
+ resp = httpx.get(url, timeout=30, follow_redirects=True)
+ resp.raise_for_status()
+ except Exception as exc:
+ logger.warning("CATARC fetch failed page={} err={}", page, exc)
+ break
+
+ soup = BeautifulSoup(resp.text, "lxml")
+ rows = soup.select("table tr")
+ if not rows:
+ break
+
+ batch: list[RawEvent] = []
+ for row in rows:
+ cells = row.find_all("td")
+ if len(cells) < 3:
+ continue
+ link = cells[0].find("a")
+ standard_code = link.get_text(strip=True) if link else cells[0].get_text(strip=True)
+ title = cells[1].get_text(strip=True) if len(cells) > 1 else standard_code
+ date_text = cells[2].get_text(strip=True) if len(cells) > 2 else ""
+ published_at = _parse_date(date_text)
+ status_text = cells[3].get_text(strip=True) if len(cells) > 3 else ""
+ status = _STATUS_MAP.get(status_text, "enacted")
+ detail_url = (_HOST + link["href"]) if link and link.get("href") else url
+ raw_text = f"{standard_code} {title}"
+ batch.append(RawEvent(
+ source="CATARC",
+ source_label="全国汽车标准化技术委员会",
+ standard_code=standard_code,
+ title=title,
+ summary=title,
+ full_text_url=detail_url,
+ status=status,
+ published_at=published_at,
+ effective_at=None,
+ category="汽车标准",
+ tags=_extract_tags(standard_code, title),
+ raw_text=raw_text,
+ ))
+
+ if not batch:
+ break
+ events.extend(batch)
+ page += 1
+
+ return events[:limit]
+
+
+def _parse_date(text: str) -> str:
+ """Return YYYY-MM-DD from common Chinese date formats, or today's date."""
+ import re
+ from datetime import date
+ text = text.strip()
+ m = re.search(r"(\d{4})[/-](\d{1,2})[/-](\d{1,2})", text)
+ if m:
+ y, mo, d = m.group(1), m.group(2).zfill(2), m.group(3).zfill(2)
+ return f"{y}-{mo}-{d}"
+ m2 = re.search(r"(\d{4})年(\d{1,2})月(\d{1,2})日?", text)
+ if m2:
+ y, mo, d = m2.group(1), m2.group(2).zfill(2), m2.group(3).zfill(2)
+ return f"{y}-{mo}-{d}"
+ return date.today().isoformat()
+
+
+def _extract_tags(standard_code: str, title: str) -> list[str]:
+ """Derive simple keyword tags from standard code and title."""
+ tags: list[str] = []
+ code_upper = standard_code.upper()
+ if "GB" in code_upper:
+ tags.append("国家标准")
+ if "/T" in code_upper:
+ tags.append("推荐性")
+ else:
+ tags.append("强制性")
+ keywords = ["电动", "安全", "自动驾驶", "充电", "智能网联", "碰撞", "排放", "网络安全"]
+ for kw in keywords:
+ if kw in title:
+ tags.append(kw)
+ return tags[:5]
+```
+
+- [ ] **Step 5: Create 国标委 crawler**
+
+```python
+# backend/app/infrastructure/perception/crawlers/guobiao_crawler.py
+"""Crawlers for the 国标委 (SAMR) standard information platform."""
+
+from __future__ import annotations
+
+import httpx
+from loguru import logger
+
+from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
+from app.infrastructure.perception.crawlers.catarc_crawler import _parse_date, _extract_tags
+
+# p.p1=1 → mandatory (强制性); p.p1=2 → recommended (推荐性)
+_BASE_URL = "https://openstd.samr.gov.cn/bzgk/std/std_list_type"
+_HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; RegulatoryBot/1.0)"}
+
+
+def _fetch_page(std_type: int, page: int, page_size: int) -> list[dict]:
+ params = {
+ "p.p1": std_type,
+ "p.p2": "车",
+ "p.p90": "circulation_date",
+ "p.p91": "desc",
+ "p.p6": page,
+ "p.p7": page_size,
+ }
+ try:
+ resp = httpx.get(_BASE_URL, params=params, headers=_HEADERS, timeout=30)
+ resp.raise_for_status()
+ data = resp.json()
+ return data.get("rows", []) or []
+ except Exception as exc:
+ logger.warning("国标委 fetch failed type={} page={} err={}", std_type, page, exc)
+ return []
+
+
+def _row_to_raw_event(row: dict, source_label: str) -> RawEvent:
+ standard_code = row.get("std_code", "")
+ title = row.get("std_name", standard_code)
+ published_at = _parse_date(row.get("release_date", ""))
+ effective_at_raw = row.get("implement_date", "")
+ effective_at = _parse_date(effective_at_raw) if effective_at_raw else None
+ status_text = row.get("std_status", "")
+ if "征求意见" in status_text:
+ status = "consultation"
+ elif "报批" in status_text or "草案" in status_text:
+ status = "draft"
+ else:
+ status = "enacted"
+ return RawEvent(
+ source="国标委",
+ source_label=source_label,
+ standard_code=standard_code,
+ title=title,
+ summary=title,
+ full_text_url=f"https://openstd.samr.gov.cn/bzgk/std/detail?id={row.get('id', '')}",
+ status=status,
+ published_at=published_at,
+ effective_at=effective_at,
+ category=row.get("std_type", "国家标准"),
+ tags=_extract_tags(standard_code, title),
+ raw_text=f"{standard_code} {title}",
+ )
+
+
+class GuobiaoMandatoryCrawler(BaseCrawler):
+ """Fetch mandatory national standards (强制性) related to vehicles."""
+
+ def fetch(self, limit: int = 50) -> list[RawEvent]:
+ events: list[RawEvent] = []
+ page = 1
+ while len(events) < limit:
+ rows = _fetch_page(std_type=1, page=page, page_size=20)
+ if not rows:
+ break
+ events.extend(_row_to_raw_event(r, "国标委·强制性") for r in rows)
+ page += 1
+ return events[:limit]
+
+
+class GuobiaoRecommendedCrawler(BaseCrawler):
+ """Fetch recommended national standards (推荐性) related to vehicles."""
+
+ def fetch(self, limit: int = 50) -> list[RawEvent]:
+ events: list[RawEvent] = []
+ page = 1
+ while len(events) < limit:
+ rows = _fetch_page(std_type=2, page=page, page_size=20)
+ if not rows:
+ break
+ events.extend(_row_to_raw_event(r, "国标委·推荐性") for r in rows)
+ page += 1
+ return events[:limit]
+```
+
+- [ ] **Step 6: Run tests**
+
+```
+cd backend && PYTHONPATH=. pytest tests/perception/test_crawlers.py -v
+```
+Expected: 3 tests PASS
+
+---
+
+## Task 4: EUR-Lex + UN-ECE crawler
+
+**Files:**
+- Create: `backend/app/infrastructure/perception/crawlers/eurlex_crawler.py`
+
+(Tests already created in `test_crawlers.py` — add to existing file)
+
+- [ ] **Step 1: Add EUR-Lex test to existing test file**
+
+Append to `backend/tests/perception/test_crawlers.py`:
+
+```python
+EURLEX_RSS = """
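+<rss version="2.0">
+<channel>
+  <title>EUR-Lex</title>
+  <item>
+    <title>Regulation (EU) 2024/1689 — AI Act</title>
+    <link>https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX:32024R1689</link>
+    <description>The EU Artificial Intelligence Act enters into force.</description>
+    <pubDate>Fri, 12 Jul 2024 00:00:00 GMT</pubDate>
+  </item>
+</channel></rss>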
+"""
+
+
+def test_eurlex_crawler_parses_rss():
+ from app.infrastructure.perception.crawlers.eurlex_crawler import EurlexCrawler
+
+ mock_resp = MagicMock()
+ mock_resp.status_code = 200
+ mock_resp.text = EURLEX_RSS
+ mock_resp.raise_for_status = MagicMock()
+
+ with patch("httpx.get", return_value=mock_resp):
+ crawler = EurlexCrawler()
+ events = crawler.fetch(limit=5)
+
+ assert isinstance(events, list)
+ assert len(events) >= 1
+ assert events[0].source == "EUR-Lex"
+```
+
+- [ ] **Step 2: Run to verify it fails**
+
+```
+cd backend && PYTHONPATH=. pytest tests/perception/test_crawlers.py::test_eurlex_crawler_parses_rss -v
+```
+Expected: ImportError
+
+- [ ] **Step 3: Implement EUR-Lex + UN-ECE crawler**
+
+```python
+# backend/app/infrastructure/perception/crawlers/eurlex_crawler.py
+"""Crawler for EUR-Lex RSS feeds covering EU AI Act and automotive regulations."""
+
+from __future__ import annotations
+
+import re
+from email.utils import parsedate_to_datetime
+
+import httpx
+from bs4 import BeautifulSoup
+from loguru import logger
+
+from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
+from app.infrastructure.perception.crawlers.catarc_crawler import _parse_date
+
+# EUR-Lex predefined RSS: legislation in force (OJ L series)
+_EURLEX_RSS_URLS = [
+ # EU AI Act + automotive-related OJ publications
+ "https://eur-lex.europa.eu/rss-feed/OJ-L.rss",
+]
+
+# UN-ECE automotive regulations via EUR-Lex CELLAR
+_UNECE_CELEX = [
+ "32024R0001", # UN R155 cybersecurity (representative CELEX; adjust as needed)
+ "32024R0002", # UN R156 software updates
+]
+
+_AUTOMOTIVE_KEYWORDS = [
+ "vehicle", "automotive", "motor", "tyre", "emission", "ADAS", "autonomous",
+ "AI Act", "artificial intelligence", "cybersecurity", "software update",
+ "R155", "R156", "汽车", "车辆",
+]
+
+
+def _is_automotive_relevant(title: str, description: str) -> bool:
+ combined = (title + " " + description).lower()
+ return any(kw.lower() in combined for kw in _AUTOMOTIVE_KEYWORDS)
+
+
+def _extract_celex(url: str) -> str:
+ """Extract CELEX number from EUR-Lex URL, or return empty string."""
+ m = re.search(r"CELEX[:/]([0-9A-Z]+)", url)
+ return m.group(1) if m else ""
+
+
+def _parse_rss_date(rfc2822: str) -> str:
+ """Parse RFC-2822 date string → YYYY-MM-DD."""
+ try:
+ dt = parsedate_to_datetime(rfc2822)
+ return dt.date().isoformat()
+ except Exception:
+ return _parse_date(rfc2822)
+
+
+class EurlexCrawler(BaseCrawler):
+    """Fetch automotive-relevant EU regulations from EUR-Lex RSS feeds."""
+
+    def fetch(self, limit: int = 50) -> list[RawEvent]:
+        events: list[RawEvent] = []
+        for rss_url in _EURLEX_RSS_URLS:
+            if len(events) >= limit:
+                break
+            try:
+                resp = httpx.get(rss_url, timeout=30, follow_redirects=True)
+                resp.raise_for_status()
+            except Exception as exc:
+                logger.warning("EUR-Lex RSS fetch failed url={} err={}", rss_url, exc)
+                continue
+
+            soup = BeautifulSoup(resp.text, "lxml-xml")
+            for item in soup.find_all("item"):
+                if len(events) >= limit:
+                    break
+                # find() returns None for a missing child; fall back to "" instead of raising AttributeError.
+                title, description, link, pub_date = (
+                    node.get_text(strip=True) if (node := item.find(tag)) is not None else ""
+                    for tag in ("title", "description", "link", "pubDate")
+                )
+
+                if not _is_automotive_relevant(title, description):
+                    continue
+
+                standard_code = _extract_celex(link) or title[:60]
+                published_at = _parse_rss_date(pub_date) if pub_date else _parse_date("")
+
+                events.append(RawEvent(
+                    source="EUR-Lex",
+                    source_label="欧盟官方公报",
+                    standard_code=standard_code,
+                    title=title,
+                    summary=description[:500],
+                    full_text_url=link,
+                    status="enacted",
+                    published_at=published_at,
+                    effective_at=None,
+                    category="EU法规",
+                    tags=_extract_eurlex_tags(title, description),
+                    raw_text=f"{title}\n{description}",
+                ))
+
+        return events[:limit]
+
+
+def _extract_eurlex_tags(title: str, description: str) -> list[str]:
+ combined = title + " " + description
+ tag_map = {
+ "AI Act": "EU AI Act",
+ "artificial intelligence": "EU AI Act",
+ "R155": "UN R155",
+ "R156": "UN R156",
+ "cybersecurity": "网络安全",
+ "emission": "排放",
+ "autonomous": "自动驾驶",
+ "ADAS": "ADAS",
+ }
+ tags = []
+ for kw, tag in tag_map.items():
+ if kw.lower() in combined.lower():
+ tags.append(tag)
+ return tags[:5]
+```
+
+- [ ] **Step 4: Run tests**
+
+```
+cd backend && PYTHONPATH=. pytest tests/perception/test_crawlers.py -v
+```
+Expected: 4 tests PASS
+
+---
+
+## Task 5: LLM Pipeline (extract + assess + diff)
+
+**Files:**
+- Create: `backend/app/infrastructure/perception/llm_pipeline.py`
+- Create: `backend/tests/perception/test_llm_pipeline.py`
+
+- [ ] **Step 1: Write the failing test**
+
+```python
+# backend/tests/perception/test_llm_pipeline.py
+"""Unit tests for LlmPipeline — mock LLM client and embedding provider."""
+from __future__ import annotations
+from unittest.mock import MagicMock, patch
+import json
+import pytest
+
+
+def _make_pipeline():
+ with patch("app.infrastructure.perception.llm_pipeline.get_llm_client") as mock_llm_fn, \
+ patch("app.infrastructure.perception.llm_pipeline.OpenAICompatibleEmbeddingProvider") as mock_emb_cls:
+
+ mock_client = MagicMock()
+ mock_client.chat.return_value = MagicMock(content='{"obligations":[{"text":"test obligation","deontic":"must","subject":"OEM","object":"system","condition":""}],"deadlines":[{"date":"2026-07-01","description":"实施截止"}],"scope":"适用于M1类车辆","penalties":"罚款","impact_level":"high"}')
+ mock_llm_fn.return_value = mock_client
+
+ mock_emb = MagicMock()
+ mock_emb.embed_texts.return_value = [[0.1] * 1024, [0.9] * 1024]
+ mock_emb_cls.return_value = mock_emb
+
+ from app.infrastructure.perception.llm_pipeline import LlmPipeline
+ return LlmPipeline(), mock_client, mock_emb
+
+
+def test_extract_structure_returns_dict():
+ pipeline, mock_client, _ = _make_pipeline()
+ event = {
+ "id": "evt-001",
+ "standard_code": "GB 18384-2025",
+ "title": "电动汽车安全要求",
+ "summary": "新增 IP67 级别防护",
+ "source_label": "CATARC",
+ "tags": ["电池安全"],
+ }
+ result = pipeline.extract_structure(event)
+ assert isinstance(result, dict)
+ assert "obligations" in result
+ assert "impact_level" in result
+
+
+def test_assess_impact_returns_list():
+ pipeline, mock_client, _ = _make_pipeline()
+ mock_client.chat.return_value = MagicMock(content='[{"doc_id":"d1","doc_name":"Safety Manual","score":0.85,"key_clauses":"§4.2","recommendation":"更新第4章"}]')
+ mock_retrieval = MagicMock()
+ chunk = MagicMock()
+ chunk.doc_id = "d1"
+ chunk.doc_title = "Safety Manual"
+ chunk.score = 0.85
+ chunk.text = "relevant text"
+ chunk.section_title = "§4.2"
+ mock_retrieval.retrieve.return_value = [chunk]
+ event = {
+ "standard_code": "GB 18384-2025",
+ "title": "电动汽车安全要求",
+ "obligations": [{"text": "OEM shall comply"}],
+ }
+ result = pipeline.assess_impact(event, mock_retrieval)
+ assert isinstance(result, list)
+
+
+def test_compute_diff_no_change():
+ pipeline, _, mock_emb = _make_pipeline()
+ # identical texts → cosine similarity = 1.0 → no changes
+ mock_emb.embed_texts.return_value = [[0.5] * 1024, [0.5] * 1024]
+ result = pipeline.compute_diff("paragraph one", "paragraph one")
+ assert isinstance(result, dict)
+ assert "changed_sections" in result
+ assert "change_summary" in result
+
+
+def test_compute_diff_detects_change():
+ pipeline, mock_client, mock_emb = _make_pipeline()
+ # low cosine similarity → change detected
+ import numpy as np
+ mock_emb.embed_texts.return_value = [
+ [1.0] + [0.0] * 1023,
+ [0.0] + [1.0] + [0.0] * 1022,
+ ]
+ mock_client.chat.return_value = MagicMock(content='{"change_type":"tightened","summary":"Requirement tightened"}')
+ result = pipeline.compute_diff("old paragraph text", "new tighter requirement text")
+ assert isinstance(result["changed_sections"], list)
+```
+
+- [ ] **Step 2: Run to verify it fails**
+
+```
+cd backend && PYTHONPATH=. pytest tests/perception/test_llm_pipeline.py -v
+```
+Expected: ImportError
+
+- [ ] **Step 3: Implement LlmPipeline**
+
+```python
+# backend/app/infrastructure/perception/llm_pipeline.py
+"""LLM-driven pipeline for regulatory event enrichment."""
+
+from __future__ import annotations
+
+import json
+import math
+from typing import Any
+
+from loguru import logger
+
+from app.config.settings import settings
+from app.infrastructure.embedding.openai_compatible_embedding_provider import (
+ OpenAICompatibleEmbeddingProvider,
+)
+from app.services.llm.llm_factory import get_llm_client
+
+_EXTRACT_SYSTEM = (
+ "You are a regulatory compliance expert specialising in automotive standards "
+ "(GB, UN-ECE, ISO, EU). Extract structured information from regulation text. "
+ "Return valid JSON only — no markdown fences, no extra keys."
+)
+
+_ASSESS_SYSTEM = (
+ "You are an automotive compliance analyst. Given a regulation and related document excerpts, "
+ "identify which documents are affected and what actions are required. "
+ "Return a JSON array only."
+)
+
+_DIFF_SYSTEM = (
+ "You are a regulatory change analyst. Given an old and new version of a regulation paragraph, "
+ "classify the type of change and summarise it. "
+ "Return JSON only: {\"change_type\": \"tightened|relaxed|added|removed\", \"summary\": \"...\"}"
+)
+
+_SIMILARITY_THRESHOLD = 0.85
+
+
+def _cosine(a: list[float], b: list[float]) -> float:
+ dot = sum(x * y for x, y in zip(a, b))
+ norm_a = math.sqrt(sum(x * x for x in a))
+ norm_b = math.sqrt(sum(x * x for x in b))
+ if norm_a == 0 or norm_b == 0:
+ return 1.0
+ return dot / (norm_a * norm_b)
+
+
+def _llm_json(client: Any, messages: list[dict]) -> Any:
+ """Call LLM and parse JSON response; return None on failure."""
+ try:
+ resp = client.chat(messages)
+ text = (resp.content or "").strip()
+ # strip markdown fences if model added them despite instructions
+ if text.startswith("```"):
+ text = text.split("```")[1]
+ if text.startswith("json"):
+ text = text[4:]
+ return json.loads(text)
+ except Exception as exc:
+ logger.warning("LLM JSON parse failed: {}", exc)
+ return None
+
+
+class LlmPipeline:
+ """Three-step enrichment pipeline for crawled regulatory events."""
+
+ def __init__(self) -> None:
+ self._client = get_llm_client(
+ provider=settings.llm_provider,
+ model=settings.llm_model,
+ )
+ self._embedder = OpenAICompatibleEmbeddingProvider()
+
+ # ------------------------------------------------------------------
+ # Step 1: Structure extraction
+ # ------------------------------------------------------------------
+
+ def extract_structure(self, event: dict) -> dict:
+ """Extract obligations, deadlines, scope, penalties, impact_level from event text."""
+ prompt = f"""Extract structured compliance information from this regulation:
+
+Standard: {event.get('standard_code', '')}
+Title: {event.get('title', '')}
+Source: {event.get('source_label', '')}
+Summary: {event.get('summary', '')}
+Tags: {', '.join(event.get('tags', []))}
+
+Return JSON with exactly these keys:
+{{
+ "obligations": [{{"text": "...", "deontic": "must|shall|may|prohibited", "subject": "...", "object": "...", "condition": ""}}],
+ "deadlines": [{{"date": "YYYY-MM-DD or null", "description": "..."}}],
+ "scope": "one sentence describing who/what this applies to",
+ "penalties": "one sentence on consequences of non-compliance, or null",
+ "impact_level": "high|medium|low"
+}}"""
+
+ messages = [
+ {"role": "system", "content": _EXTRACT_SYSTEM},
+ {"role": "user", "content": prompt},
+ ]
+ result = _llm_json(self._client, messages)
+ if not isinstance(result, dict):
+ return {
+ "obligations": [],
+ "deadlines": [],
+ "scope": "",
+ "penalties": "",
+ "impact_level": "medium",
+ }
+ return result
+
+ # ------------------------------------------------------------------
+ # Step 2: Impact assessment
+ # ------------------------------------------------------------------
+
+ def assess_impact(self, event: dict, retrieval_service: Any) -> list[dict]:
+ """Use RAG to find affected documents and generate recommendations."""
+ obligations = event.get("obligations") or []
+ obligation_texts = " ".join(o.get("text", "") for o in obligations[:3])
+ query = f"{event.get('standard_code', '')} {event.get('title', '')} {obligation_texts}"
+
+ try:
+ chunks = retrieval_service.retrieve(query=query, top_k=5)
+ except Exception as exc:
+ logger.warning("RAG retrieval failed: {}", exc)
+ return []
+
+ if not chunks:
+ return []
+
+ seen: set[str] = set()
+ doc_excerpts: list[dict] = []
+ for chunk in chunks:
+ if chunk.doc_id not in seen:
+ seen.add(chunk.doc_id)
+ doc_excerpts.append({
+ "doc_id": chunk.doc_id,
+ "doc_name": chunk.doc_title,
+ "score": round(float(chunk.score), 4),
+ "snippet": (chunk.text or "")[:300],
+ "clause": getattr(chunk, "section_title", "") or "",
+ })
+
+ context = "\n".join(
+ f"[{d['doc_name']} {d['clause']}] score={d['score']}: {d['snippet']}"
+ for d in doc_excerpts
+ )
+ prompt = f"""Regulation: {event.get('standard_code')} — {event.get('title')}
+Obligations: {obligation_texts or event.get('summary', '')}
+
+Affected documents found in knowledge base:
+{context}
+
+For each document, assess impact and recommend action. Return JSON array:
+[{{"doc_id":"...","doc_name":"...","score":0.0,"key_clauses":"...","recommendation":"one sentence action"}}]"""
+
+ messages = [
+ {"role": "system", "content": _ASSESS_SYSTEM},
+ {"role": "user", "content": prompt},
+ ]
+ result = _llm_json(self._client, messages)
+ if isinstance(result, list):
+ # merge score from retrieval (more reliable than LLM-invented scores)
+ score_map = {d["doc_id"]: d["score"] for d in doc_excerpts}
+ for item in result:
+ if isinstance(item, dict) and item.get("doc_id") in score_map:
+ item["score"] = score_map[item["doc_id"]]
+ return result
+ return doc_excerpts # fallback: return retrieval results without LLM recommendation
+
+ # ------------------------------------------------------------------
+ # Step 3: Semantic diff
+ # ------------------------------------------------------------------
+
+ def compute_diff(self, old_text: str, new_text: str) -> dict:
+ """Compare old and new regulation text; return changed sections and summary."""
+ old_paras = [p.strip() for p in old_text.split("\n") if p.strip()]
+ new_paras = [p.strip() for p in new_text.split("\n") if p.strip()]
+
+ if not old_paras or not new_paras:
+ return {"changed_sections": [], "change_summary": "No comparable text."}
+
+ all_paras = old_paras + new_paras
+ try:
+ all_embeddings = self._embedder.embed_texts(all_paras)
+ except Exception as exc:
+ logger.warning("Embedding for diff failed: {}", exc)
+ return {"changed_sections": [], "change_summary": "Diff unavailable (embedding error)."}
+
+ old_embeddings = all_embeddings[: len(old_paras)]
+ new_embeddings = all_embeddings[len(old_paras):]
+
+ # Pair paragraphs by position (zip — handles length differences)
+ changed_sections: list[dict] = []
+ for i, (old_emb, new_emb, old_p, new_p) in enumerate(
+ zip(old_embeddings, new_embeddings, old_paras, new_paras)
+ ):
+ sim = _cosine(old_emb, new_emb)
+ if sim < _SIMILARITY_THRESHOLD:
+ messages = [
+ {"role": "system", "content": _DIFF_SYSTEM},
+ {"role": "user", "content": f"OLD: {old_p[:500]}\nNEW: {new_p[:500]}"},
+ ]
+ classification = _llm_json(self._client, messages) or {}
+ changed_sections.append({
+ "old_text": old_p[:300],
+ "new_text": new_p[:300],
+ "similarity": round(sim, 3),
+ "change_type": classification.get("change_type", "modified"),
+ "summary": classification.get("summary", ""),
+ })
+
+ if not changed_sections:
+ change_summary = "No substantive changes detected between versions."
+ else:
+ types = [s["change_type"] for s in changed_sections]
+ change_summary = (
+ f"{len(changed_sections)} paragraph(s) changed: "
+ + ", ".join(f"{t}" for t in set(types))
+ + ". "
+ + (changed_sections[0].get("summary", "") if changed_sections else "")
+ )
+
+ return {"changed_sections": changed_sections, "change_summary": change_summary}
+```
+
+- [ ] **Step 4: Run tests**
+
+```
+cd backend && PYTHONPATH=. pytest tests/perception/test_llm_pipeline.py -v
+```
+Expected: 4 tests PASS
+
+---
+
+## Task 6: CrawlService
+
+**Files:**
+- Create: `backend/app/application/perception/crawl_service.py`
+- Create: `backend/tests/perception/test_crawl_service.py`
+
+- [ ] **Step 1: Write the failing test**
+
+```python
+# backend/tests/perception/test_crawl_service.py
+"""Integration tests for CrawlService."""
+from __future__ import annotations
+from unittest.mock import MagicMock
+import hashlib
+import pytest
+
+from app.infrastructure.perception.crawlers.base import RawEvent
+from app.infrastructure.perception.mock_event_store import MockEventStore
+
+
+def _make_raw_event(code="TST-001"):
+ return RawEvent(
+ source="TEST", source_label="Test", standard_code=code,
+ title=f"Test {code}", summary="Summary", full_text_url="https://example.com",
+ status="enacted", published_at="2026-01-01", effective_at=None,
+ category="test", tags=["test"], raw_text="full text",
+ )
+
+
+def _make_service(raw_events):
+ from app.application.perception.crawl_service import CrawlService
+
+ mock_crawler = MagicMock()
+ mock_crawler.fetch.return_value = raw_events
+
+ mock_pipeline = MagicMock()
+ mock_pipeline.extract_structure.return_value = {
+ "obligations": [], "deadlines": [], "scope": "test",
+ "penalties": None, "impact_level": "low",
+ }
+ mock_pipeline.assess_impact.return_value = []
+ mock_pipeline.compute_diff.return_value = {
+ "changed_sections": [], "change_summary": "No changes.",
+ }
+
+ mock_retrieval = MagicMock()
+ store = MockEventStore()
+
+ return CrawlService(
+ crawlers={"TEST": mock_crawler},
+ event_store=store,
+ llm_pipeline=mock_pipeline,
+ retrieval_service=mock_retrieval,
+ )
+
+
+def test_crawl_yields_progress_and_done():
+ svc = _make_service([_make_raw_event("TST-001")])
+ events = list(svc.run_crawl())
+ event_types = [e.get("event") for e in events]
+ assert "done" in event_types
+
+
+def test_crawl_upserts_to_store():
+ store = MockEventStore()
+ from app.application.perception.crawl_service import CrawlService
+ mock_crawler = MagicMock()
+ mock_crawler.fetch.return_value = [_make_raw_event("NEW-001")]
+ mock_pipeline = MagicMock()
+ mock_pipeline.extract_structure.return_value = {
+ "obligations": [], "deadlines": [], "scope": "",
+ "penalties": None, "impact_level": "medium",
+ }
+ mock_pipeline.assess_impact.return_value = []
+ mock_pipeline.compute_diff.return_value = {
+ "changed_sections": [], "change_summary": "",
+ }
+ svc = CrawlService(
+ crawlers={"TEST": mock_crawler},
+ event_store=store,
+ llm_pipeline=mock_pipeline,
+ retrieval_service=MagicMock(),
+ )
+ list(svc.run_crawl())
+ result = store.get_by_standard_code("NEW-001")
+ assert result is not None
+ assert result["title"] == "Test NEW-001"
+
+
+def test_crawl_skips_unchanged_events():
+ store = MockEventStore()
+ raw = _make_raw_event("SKIP-001")
+ content_hash = hashlib.sha256(raw.raw_text.encode()).hexdigest()
+ # Pre-seed with same hash
+ store.upsert({
+ "id": hashlib.sha256(f"TEST-SKIP-001".encode()).hexdigest()[:12],
+ "standard_code": "SKIP-001",
+ "source": "TEST",
+ "source_label": "Test",
+ "title": "Test SKIP-001",
+ "summary": "",
+ "full_text_url": "",
+ "status": "enacted",
+ "impact_level": "low",
+ "published_at": "2026-01-01",
+ "effective_at": None,
+ "category": "test",
+ "tags": [],
+ "content_hash": content_hash,
+ })
+ mock_pipeline = MagicMock()
+ from app.application.perception.crawl_service import CrawlService
+ mock_crawler = MagicMock()
+ mock_crawler.fetch.return_value = [raw]
+ svc = CrawlService(
+ crawlers={"TEST": mock_crawler},
+ event_store=store,
+ llm_pipeline=mock_pipeline,
+ retrieval_service=MagicMock(),
+ )
+ list(svc.run_crawl())
+ # pipeline should NOT have been called for unchanged event
+ mock_pipeline.extract_structure.assert_not_called()
+```
+
+- [ ] **Step 2: Run to verify it fails**
+
+```
+cd backend && PYTHONPATH=. pytest tests/perception/test_crawl_service.py -v
+```
+Expected: ImportError
+
+- [ ] **Step 3: Implement CrawlService**
+
+```python
+# backend/app/application/perception/crawl_service.py
+"""Orchestrates regulatory source crawlers and LLM enrichment pipeline."""
+
+from __future__ import annotations
+
+import hashlib
+from typing import Any, Generator
+
+from loguru import logger
+
+from app.infrastructure.perception.base_event_store import BaseEventStore
+from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
+from app.infrastructure.perception.llm_pipeline import LlmPipeline
+
+
+def _event_id(source: str, standard_code: str) -> str:
+ """Deterministic 12-char ID from source + standard_code."""
+ return hashlib.sha256(f"{source}-{standard_code}".encode()).hexdigest()[:12]
+
+
+def _content_hash(raw_text: str) -> str:
+ return hashlib.sha256(raw_text.encode()).hexdigest()
+
+
+def _raw_to_dict(raw: RawEvent, event_id: str, content_hash: str) -> dict:
+ return {
+ "id": event_id,
+ "source": raw.source,
+ "source_label": raw.source_label,
+ "standard_code": raw.standard_code,
+ "title": raw.title,
+ "summary": raw.summary,
+ "full_text_url": raw.full_text_url,
+ "status": raw.status,
+ "impact_level": "medium", # updated by LLM pipeline
+ "published_at": raw.published_at,
+ "effective_at": raw.effective_at,
+ "category": raw.category,
+ "tags": raw.tags,
+ "content_hash": content_hash,
+ "previous_hash": None,
+ }
+
+
+class CrawlService:
+ """Orchestrate crawlers, hash-based change detection, and LLM enrichment."""
+
+ def __init__(
+ self,
+ crawlers: dict[str, BaseCrawler],
+ event_store: BaseEventStore,
+ llm_pipeline: LlmPipeline,
+ retrieval_service: Any,
+ ) -> None:
+ self._crawlers = crawlers
+ self._store = event_store
+ self._pipeline = llm_pipeline
+ self._retrieval = retrieval_service
+
+ def run_crawl(
+ self, sources: list[str] | None = None
+ ) -> Generator[dict, None, None]:
+ """Run crawl for selected sources. Yields SSE-ready progress dicts."""
+ targets = sources or list(self._crawlers.keys())
+ total_new = 0
+ total_updated = 0
+
+ for source_key in targets:
+ crawler = self._crawlers.get(source_key)
+ if not crawler:
+ yield {"event": "error", "data": f"Unknown source: {source_key}"}
+ continue
+
+ yield {"event": "progress", "data": {"source": source_key, "stage": "fetching"}}
+ try:
+ raw_events = crawler.fetch(limit=100)
+ except Exception as exc:
+ logger.exception("Crawler failed source={}", source_key)
+ yield {"event": "error", "data": {"source": source_key, "message": str(exc)}}
+ continue
+
+ yield {
+ "event": "progress",
+ "data": {"source": source_key, "stage": "processing", "fetched": len(raw_events)},
+ }
+
+ new_count = 0
+ updated_count = 0
+
+ for raw in raw_events:
+ eid = _event_id(raw.source, raw.standard_code)
+ new_hash = _content_hash(raw.raw_text or raw.title)
+ existing = self._store.get(eid)
+
+ if existing and existing.get("content_hash") == new_hash:
+ # Unchanged — skip LLM processing
+ continue
+
+ is_update = existing is not None
+ old_text = existing.get("summary", "") if is_update else ""
+ previous_hash = existing.get("content_hash") if is_update else None
+
+ event_dict = _raw_to_dict(raw, eid, new_hash)
+ event_dict["previous_hash"] = previous_hash
+
+ # Step 1: Structure extraction
+ try:
+ structure = self._pipeline.extract_structure(event_dict)
+ event_dict.update(structure)
+ except Exception as exc:
+ logger.warning("Structure extraction failed id={} err={}", eid, exc)
+
+ # Step 2: Impact assessment
+ try:
+ affected = self._pipeline.assess_impact(event_dict, self._retrieval)
+ event_dict["affected_docs"] = affected
+ except Exception as exc:
+ logger.warning("Impact assessment failed id={} err={}", eid, exc)
+
+ # Step 3: Semantic diff (only when updating existing event)
+ if is_update and old_text and raw.raw_text:
+ try:
+ diff = self._pipeline.compute_diff(old_text, raw.raw_text)
+ event_dict["change_summary"] = diff.get("change_summary")
+ event_dict["changed_sections"] = diff.get("changed_sections")
+ except Exception as exc:
+ logger.warning("Diff failed id={} err={}", eid, exc)
+
+ self._store.upsert(event_dict)
+
+ if is_update:
+ updated_count += 1
+ else:
+ new_count += 1
+
+ total_new += new_count
+ total_updated += updated_count
+
+ yield {
+ "event": "progress",
+ "data": {
+ "source": source_key,
+ "stage": "done",
+ "new": new_count,
+ "updated": updated_count,
+ },
+ }
+
+ yield {
+ "event": "done",
+ "data": {"total_new": total_new, "total_updated": total_updated},
+ }
+```
+
+- [ ] **Step 4: Run tests**
+
+```
+cd backend && PYTHONPATH=. pytest tests/perception/test_crawl_service.py -v
+```
+Expected: 3 tests PASS
+
+---
+
+## Task 7: Wire bootstrap + add settings + update PerceptionService type hint
+
+**Files:**
+- Modify: `backend/app/config/settings.py`
+- Modify: `backend/app/shared/bootstrap.py`
+- Modify: `backend/app/application/perception/services.py`
+- Modify: `backend/requirements.txt`
+- Modify: `.env` (repository root)
+- Modify: `.env.example` (repository root)
+
+- [ ] **Step 1: Add settings**
+
+In `backend/app/config/settings.py`, after the `use_celery_worker` field (line ~88), add:
+
+```python
+ # ── Perception crawl ──────────────────────────────────────────────────────
+ perception_crawl_timeout_seconds: int = Field(
+ default=120, description="HTTP timeout for regulatory source crawlers."
+ )
+ perception_max_events_per_source: int = Field(
+ default=100, description="Maximum events fetched per source per crawl run."
+ )
+ perception_diff_similarity_threshold: float = Field(
+ default=0.85,
+ description="Cosine similarity below which a paragraph is flagged as changed.",
+ )
+```
+
+- [ ] **Step 2: Add env vars to .env and .env.example**
+
+Add to `.env` in the repository root (after `USE_CELERY_WORKER=false`):
+```
+PERCEPTION_CRAWL_TIMEOUT_SECONDS=120
+PERCEPTION_MAX_EVENTS_PER_SOURCE=100
+PERCEPTION_DIFF_SIMILARITY_THRESHOLD=0.85
+```
+
+Add the same block to `.env.example`.
+
+- [ ] **Step 3: Fix type hint in PerceptionService**
+
+In `backend/app/application/perception/services.py`, change:
+
+```python
+from app.infrastructure.perception.mock_event_store import MockEventStore
+```
+to:
+```python
+from app.infrastructure.perception.base_event_store import BaseEventStore
+```
+
+Change constructor type hint from:
+```python
+ def __init__(
+ self,
+ event_store: MockEventStore,
+ retrieval_service: KnowledgeRetrievalService,
+ ) -> None:
+```
+to:
+```python
+ def __init__(
+ self,
+ event_store: BaseEventStore,
+ retrieval_service: KnowledgeRetrievalService,
+ ) -> None:
+```
+
+- [ ] **Step 4: Wire bootstrap.py**
+
+At the top of `backend/app/shared/bootstrap.py`, after existing imports, add:
+
+```python
+from app.application.perception.crawl_service import CrawlService
+from app.infrastructure.perception.base_event_store import BaseEventStore
+from app.infrastructure.perception.crawlers.catarc_crawler import CatarcCrawler
+from app.infrastructure.perception.crawlers.guobiao_crawler import (
+ GuobiaoMandatoryCrawler,
+ GuobiaoRecommendedCrawler,
+)
+from app.infrastructure.perception.crawlers.eurlex_crawler import EurlexCrawler
+from app.infrastructure.perception.llm_pipeline import LlmPipeline
+```
+
+Replace the existing `get_perception_service()` function:
+
+```python
+@lru_cache
+def _get_event_store() -> BaseEventStore:
+ """Return event store selected by DOCUMENT_REPOSITORY_BACKEND setting."""
+ if settings.document_repository_backend == "postgres":
+ from app.infrastructure.perception.postgres_event_store import PostgresEventStore
+ return PostgresEventStore()
+ return MockEventStore()
+
+
+@lru_cache
+def get_perception_service() -> PerceptionService:
+ """Return perception service for regulatory intelligence."""
+ return PerceptionService(
+ event_store=_get_event_store(),
+ retrieval_service=get_retrieval_service(),
+ )
+
+
+@lru_cache
+def get_crawl_service() -> CrawlService:
+ """Return CrawlService wired with all registered crawlers and LLM pipeline."""
+ crawlers = {
+ "CATARC": CatarcCrawler(),
+ "国标委·强制性": GuobiaoMandatoryCrawler(),
+ "国标委·推荐性": GuobiaoRecommendedCrawler(),
+ "EUR-Lex": EurlexCrawler(),
+ }
+ return CrawlService(
+ crawlers=crawlers,
+ event_store=_get_event_store(),
+ llm_pipeline=LlmPipeline(),
+ retrieval_service=get_retrieval_service(),
+ )
+```
+
+- [ ] **Step 5: Add beautifulsoup4 + lxml to requirements.txt**
+
+After the `httpx>=0.25.0` line in `backend/requirements.txt`, add:
+
+```
+beautifulsoup4>=4.12.0
+lxml>=5.0.0
+```
+
+- [ ] **Step 6: Verify imports work**
+
+```
+cd backend && PYTHONPATH=. python -c "from app.shared.bootstrap import get_crawl_service; print('ok')"
+```
+Expected: `ok`
+
+---
+
+## Task 8: New API endpoints (crawl + process + diff)
+
+**Files:**
+- Modify: `backend/app/api/routes/perception.py`
+
+- [ ] **Step 1: Add three new endpoints**
+
+Open `backend/app/api/routes/perception.py`. After the existing `analyze_event` endpoint, add:
+
+```python
+from fastapi import Depends
+from app.api.dependencies.auth import get_current_user
+from app.domain.auth.models import UserClaims
+from app.shared.bootstrap import get_crawl_service
+
+
+@router.post("/crawl")
+async def run_crawl(
+ body: dict = None,
+ current_user: UserClaims = Depends(get_current_user),
+):
+ """Trigger manual crawl of regulatory sources. Streams SSE progress.
+
+ Body (optional): {"sources": ["CATARC", "国标委·强制性", "EUR-Lex"]}
+ Omit sources to crawl all registered sources.
+ """
+ sources: list[str] | None = (body or {}).get("sources")
+ crawl_svc = get_crawl_service()
+
+ async def crawl_stream():
+ async for item in iter_in_thread(crawl_svc.run_crawl(sources=sources)):
+ event_name = item.get("event", "message")
+ data = item.get("data", "")
+ if isinstance(data, (dict, list)):
+ data = json.dumps(data, ensure_ascii=False)
+ yield f"event: {event_name}\ndata: {data}\n\n"
+
+ return StreamingResponse(
+ crawl_stream(),
+ media_type="text/event-stream",
+ headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
+ )
+
+
+@router.post("/events/{event_id}/process")
+async def process_event(
+ event_id: str,
+ current_user: UserClaims = Depends(get_current_user),
+):
+ """Trigger LLM pipeline (extract + assess + diff) for a single event."""
+ from datetime import UTC, datetime
+ from app.infrastructure.perception.llm_pipeline import LlmPipeline
+ from app.shared.bootstrap import get_retrieval_service
+
+ event = get_perception_service().get_event(event_id)
+ if not event:
+ from fastapi import HTTPException
+ raise HTTPException(status_code=404, detail=f"Event {event_id} not found")
+
+ store = get_crawl_service()._store # share the same store instance
+ pipeline = LlmPipeline()
+
+ structure = pipeline.extract_structure(event)
+ event.update(structure)
+ event["affected_docs"] = pipeline.assess_impact(event, get_retrieval_service())
+ event["processed_at"] = datetime.now(UTC).isoformat()
+ store.upsert(event)
+
+ return {"status": "ok", "event_id": event_id, "processed_at": event["processed_at"]}
+
+
+@router.get("/events/{event_id}/diff")
+async def get_event_diff(event_id: str):
+ """Return semantic diff detail for an event (only available if previously crawled twice)."""
+ event = get_perception_service().get_event(event_id)
+ if not event:
+ from fastapi import HTTPException
+ raise HTTPException(status_code=404, detail=f"Event {event_id} not found")
+ if not event.get("change_summary"):
+ from fastapi import HTTPException
+ raise HTTPException(status_code=404, detail="No diff available for this event")
+ return {
+ "event_id": event_id,
+ "change_summary": event.get("change_summary"),
+ "changed_sections": event.get("changed_sections") or [],
+ "previous_hash": event.get("previous_hash"),
+ "content_hash": event.get("content_hash"),
+ }
+```
+
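+- [ ] **Step 2: Smoke test with curl (backend running)**
+
+```bash
+# With backend running (./dev.sh start api):
+curl -s -H "Authorization: Bearer $TOKEN" \
+ http://localhost:8000/api/v1/perception/stats | python -m json.tool
+
+# Exercise the new crawl endpoint (-N disables buffering so SSE frames print as they arrive):
+curl -N -s -X POST -H "Authorization: Bearer $TOKEN" -H "Content-Type: application/json" \
+ -d '{"sources": ["EUR-Lex"]}' http://localhost:8000/api/v1/perception/crawl
+```
+Expected: the first command prints JSON with `total`, `high_impact`, `medium_impact`, `recent_90d`; the second streams `progress` events followed by a final `done` event.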
+
+---
+
+## Task 9: Frontend — Crawl Bar + Detail Tabs
+
+**Files:**
+- Modify: `frontend/src/pages/Perception/PerceptionPage.tsx`
+
+- [ ] **Step 1: Add CrawlBar state and handler at the top of PerceptionPage**
+
+In `PerceptionPage.tsx`, after the existing `abortRef` line (~line 107), add:
+
+```tsx
+ const [crawling, setCrawling] = useState(false);
+ const [crawlStatus, setCrawlStatus] = useState('');
+ const [detailTab, setDetailTab] = useState<'overview'|'obligations'|'assessment'|'diff'>('overview');
+
+ // Extended signal shape from DB (populated after crawl)
+ const [selectedFull, setSelectedFull] = useState<Record<string, unknown> | null>(null);
+
+ async function fetchFullEvent(id: string) {
+ try {
+ const res = await fetch(`/api/v1/perception/events/${id}`, { headers: authHeader() });
+ if (res.ok) setSelectedFull(await res.json());
+ } catch { /* ignore */ }
+ }
+```
+
+- [ ] **Step 2: Add runCrawl function**
+
+After `stopAnalysis()`, add:
+
+```tsx
+  // Trigger a crawl and mirror the server's SSE progress frames into crawlStatus.
+  async function runCrawl() {
+    setCrawling(true);
+    setCrawlStatus('正在连接数据源...');
+    try {
+      const res = await fetch('/api/v1/perception/crawl', {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json', ...authHeader() },
+        body: JSON.stringify({}),
+      });
+      // A 401/500 response still has a body, so check the status before streaming.
+      if (!res.ok) { setCrawlStatus(`错误: HTTP ${res.status}`); return; }
+      if (!res.body) { setCrawlStatus('No stream'); return; }
+      const reader = res.body.getReader();
+      const dec = new TextDecoder();
+      let buf = '';
+      while (true) {
+        const { done, value } = await reader.read();
+        if (done) break;
+        // stream: true keeps a multi-byte character that is split across two
+        // chunks intact (source names and messages contain CJK text).
+        buf += dec.decode(value, { stream: true });
+        const parts = buf.split('\n\n');
+        buf = parts.pop() ?? '';
+        for (const block of parts) {
+          const lines = block.split('\n');
+          const eventLine = lines.find(l => l.startsWith('event: '));
+          const dataLine = lines.find(l => l.startsWith('data: '));
+          const evtName = eventLine?.slice(7).trim();
+          const raw = dataLine?.slice(6).trim();
+          if (!raw) continue;
+          try {
+            const d = JSON.parse(raw);
+            if (evtName === 'progress') {
+              setCrawlStatus(`${d.source}: ${d.stage === 'fetching' ? '抓取中...' : d.stage === 'processing' ? `处理 ${d.fetched} 条...` : `完成 +${d.new} 条`}`);
+            } else if (evtName === 'done') {
+              setCrawlStatus(`更新完成 — 新增 ${d.total_new} 条,更新 ${d.total_updated} 条`);
+              // refresh event list
+              fetch('/api/v1/perception/events?limit=100', { headers: authHeader() })
+                .then(r => r.json())
+                .then(d2 => { if (Array.isArray(d2?.events)) setSignals(d2.events.map(mapEvent)); });
+            } else if (evtName === 'error') {
+              setCrawlStatus(`错误: ${typeof d === 'string' ? d : d.message}`);
+            }
+          } catch {
+            // Error frames may carry plain text instead of JSON; show it as-is.
+            if (evtName === 'error') setCrawlStatus(`错误: ${raw}`);
+          }
+        }
+      }
+    } catch (e: unknown) {
+      setCrawlStatus(`连接失败: ${e instanceof Error ? e.message : String(e)}`);
+    } finally {
+      // Runs on every exit path, including the early returns above.
+      setCrawling(false);
+    }
+  }
+```
+
+- [ ] **Step 3: Update selectSignal to also fetch full event**
+
+Replace:
+```tsx
+ function selectSignal(sig: Signal) {
+ setSelected(sig);
+ setAiOutput('');
+ setStreaming(false);
+ }
+```
+with:
+```tsx
+ function selectSignal(sig: Signal) {
+ setSelected(sig);
+ setSelectedFull(null);
+ setAiOutput('');
+ setStreaming(false);
+ setDetailTab('overview');
+ fetchFullEvent(sig.id);
+ }
+```
+
+- [ ] **Step 4: Replace Topbar Refresh button with CrawlBar**
+
+Replace the existing:
+```tsx
+
+```
+with:
+```tsx
+
+ {crawlStatus && {crawlStatus}}
+```
+
+- [ ] **Step 5: Replace right panel with tabbed detail view**
+
+Replace the entire right panel section (the `` block, roughly lines 267–319) with:
+
+```tsx
+
+ {!selected ? (
+
+
+
Select a signal to run impact analysis
+
+ ) : (
+ <>
+ {/* ── Detail header card ── */}
+
+
+ {selected.source}
+ {selected.standard}
+
+ {selected.status === 'risk' ? 'Urgent' : selected.status === 'warn' ? 'Draft' : 'Published'}
+
+ {selectedFull?.change_summary && (
+ CHANGED
+ )}
+
+
{selected.title}
+
{selected.summary}
+
+ {!streaming
+ ?
+ :
+ }
+ {selected && (
+
+ Source
+
+ )}
+
+
+
+ {/* ── Tab bar ── */}
+
+ {(['overview', 'obligations', 'assessment', 'diff'] as const).map(tab => (
+
+ ))}
+
+
+ {/* ── Tab content ── */}
+ {detailTab === 'overview' && (
+
+
Scope & Summary
+
+ {(selectedFull?.scope as string) || selected.summary}
+
+ {selectedFull?.penalties && (
+
+ ⚠ {selectedFull.penalties as string}
+
+ )}
+
+ )}
+
+ {detailTab === 'obligations' && (
+
+
义务条款
+ {(() => {
+ const obs = (selectedFull?.obligations as Array
>) || [];
+ const deadlines = (selectedFull?.deadlines as Array>) || [];
+ return obs.length === 0 && deadlines.length === 0 ? (
+ 暂无结构化数据。点击右上角"Run impact analysis"触发提取。
+ ) : (
+ <>
+ {obs.length > 0 && (
+
+
+
+ | 义务描述 |
+ 主体 |
+ 类型 |
+
+
+
+ {obs.map((ob, i) => (
+
+ | {ob.text} |
+ {ob.subject} |
+
+
+ {ob.deontic}
+
+ |
+
+ ))}
+
+
+ )}
+ {deadlines.length > 0 && (
+
+
截止日期
+ {deadlines.map((d, i) => (
+
+ {d.date || '待定'}
+ {d.description}
+
+ ))}
+
+ )}
+ >
+ );
+ })()}
+
+ )}
+
+ {detailTab === 'assessment' && (
+
+
Affected documents
+ {(() => {
+ const docs = (selectedFull?.affected_docs as Array
>) || MOCK_DOCS.map(d => ({ doc_name: d.name, score: d.score / 100, key_clauses: d.clause, snippet: d.snippet, recommendation: '' }));
+ return docs.length === 0
+ ? No affected documents found.
+ : docs.map((d, i) => (
+
+
{Math.round(Number(d.score ?? 0) * 100)}%
+
+
+ {String(d.doc_name || '')}
+ {String(d.key_clauses || d.clause || '')}
+
+ {d.snippet &&
{String(d.snippet)}
}
+ {d.recommendation && (
+
→ {String(d.recommendation)}
+ )}
+
+
+ ));
+ })()}
+
+ )}
+
+ {detailTab === 'diff' && selectedFull?.change_summary && (
+
+
变更对比
+
+ {selectedFull.change_summary as string}
+
+ {(() => {
+ const sections = (selectedFull.changed_sections as Array
>) || [];
+ return sections.map((s, i) => (
+
+
+
+ {String(s.change_type)}
+
+ cosine: {String(s.similarity)}
+
+
+
+
旧版
+ {String(s.old_text)}
+
+
+
新版
+ {String(s.new_text)}
+
+
+ {s.summary &&
{String(s.summary)}
}
+
+ ));
+ })()}
+
+ )}
+
+ {/* ── AI Analysis card (unchanged) ── */}
+ {(aiOutput || streaming) && (
+
+
AI Impact Analysis
+
+ {aiOutput}
+ {streaming && ▋}
+
+
+ )}
+ >
+ )}
+
+```
+
+- [ ] **Step 6: Add CSS for tabs and spin animation**
+
+In `frontend/src/styles/globals.css`, append at the end:
+
+```css
+/* ── Perception detail tabs ── */
+.detail-tabs {
+ display: flex;
+ gap: 2px;
+ margin: 8px 0 0;
+ border-bottom: 1px solid var(--border);
+ padding-bottom: 0;
+}
+.detail-tab {
+ background: none;
+ border: none;
+ border-bottom: 2px solid transparent;
+ padding: 6px 14px;
+ font-size: 13px;
+ color: var(--text-secondary);
+ cursor: pointer;
+ transition: color 0.15s, border-color 0.15s;
+}
+.detail-tab:hover { color: var(--text); }
+.detail-tab.active {
+ color: var(--accent);
+ border-bottom-color: var(--accent);
+ font-weight: 600;
+}
+.detail-tab.disabled {
+ opacity: 0.35;
+ cursor: not-allowed;
+}
+
+/* ── Spin animation for crawl refresh icon ── */
+@keyframes spin { from { transform: rotate(0deg); } to { transform: rotate(360deg); } }
+.spin { animation: spin 1s linear infinite; }
+```
+
+- [ ] **Step 7: Verify TypeScript compiles**
+
+```
+cd frontend && npx tsc --noEmit
+```
+Expected: no errors (or only pre-existing errors unrelated to PerceptionPage)
+
+---
+
+## Task 10: Install new Python dependencies
+
+**Files:**
+- Modify: `backend/requirements.txt` (already done in Task 7)
+
+- [ ] **Step 1: Install on server**
+
+```bash
+# On the server (in project root):
+.venv/bin/pip install "beautifulsoup4>=4.12.0" "lxml>=5.0.0"
+```
+
+- [ ] **Step 2: Verify import**
+
+```bash
+PYTHONPATH=backend .venv/bin/python -c "from bs4 import BeautifulSoup; print('ok')"
+```
+Expected: `ok`
+
+- [ ] **Step 3: Run all perception tests**
+
+```
+cd backend && PYTHONPATH=. pytest tests/perception/ -v
+```
+Expected: all tests PASS
+
+---
+
+## Task 11: End-to-end verification
+
+- [ ] **Step 1: Start backend**
+
+```bash
+./dev.sh start api
+```
+
+- [ ] **Step 2: Verify stats endpoint still works**
+
+```bash
+TOKEN=$(curl -s -X POST http://localhost:8000/api/v1/auth/login \
+ -H "Content-Type: application/json" \
+ -d '{"username":"admin","password":"Admin@2026!"}' | python -m json.tool | grep access_token | cut -d'"' -f4)
+
+curl -s -H "Authorization: Bearer $TOKEN" \
+ http://localhost:8000/api/v1/perception/stats | python -m json.tool
+```
+Expected: `{"total": ..., "high_impact": ..., ...}`
+
+- [ ] **Step 3: Trigger manual crawl (with DOCUMENT_REPOSITORY_BACKEND=json, uses MockEventStore)**
+
+```bash
+curl -s -X POST \
+ -H "Authorization: Bearer $TOKEN" \
+ -H "Content-Type: application/json" \
+ http://localhost:8000/api/v1/perception/crawl \
+ -d '{"sources":["CATARC"]}' --no-buffer
+```
+Expected: SSE stream with `event: progress` lines followed by `event: done`
+
+- [ ] **Step 4: Switch to postgres backend and re-verify (if PostgreSQL available)**
+
+In `.env`, set `DOCUMENT_REPOSITORY_BACKEND=postgres`, restart the API, then repeat Steps 2 and 3. Verify that events appear in the `regulation_events` table:
+
+```bash
+psql -h 6.86.80.8 -U postgresql -d compliance_db -c "SELECT COUNT(*) FROM regulation_events;"
+```
+
+- [ ] **Step 5: Build frontend on server**
+
+```bash
+cd frontend && npm install && npm run build
+```
+Expected: build succeeds
+
+- [ ] **Step 6: Open browser, navigate to Regulatory Signals page**
+
+Verify:
+- Stats bar shows real counts
+- "刷新数据源" button is visible in topbar
+- Clicking a signal shows 概览 / 义务条款 / 影响评估 / 变更对比 tabs
+- 变更对比 tab is greyed out until a second crawl detects a change
+
+---
+
+## Self-Review
+
+**Spec coverage check:**
+
+| Spec requirement | Task |
+|-----------------|------|
+| Replace MockEventStore → PostgresEventStore | Tasks 1, 2, 7 |
+| BaseEventStore ABC as port | Task 1 |
+| CATARC crawler | Task 3 |
+| 国标委 strong + recommended crawlers | Task 3 |
+| EUR-Lex RSS crawler | Task 4 |
+| LLM structure extraction | Task 5 |
+| LLM impact assessment (RAG) | Task 5 |
+| Semantic diff via embedding | Task 5 |
+| CrawlService with hash-based skip | Task 6 |
+| bootstrap.py wiring + settings | Task 7 |
+| POST /crawl SSE endpoint | Task 8 |
+| POST /events/{id}/process endpoint | Task 8 |
+| GET /events/{id}/diff endpoint | Task 8 |
+| Frontend crawl bar + progress | Task 9 |
+| Frontend detail tabs (4 tabs) | Task 9 |
+| Changed badge on signal cards | Task 9 (CHANGED badge in header) |
+| Real affected_docs replacing MOCK_DOCS | Task 9 |
+| New Python dependencies | Task 10 |
+| E2E verification | Task 11 |
+
+All spec requirements covered. No placeholders found.
diff --git a/docs/superpowers/specs/2026-06-05-perception-intelligence-design.md b/docs/superpowers/specs/2026-06-05-perception-intelligence-design.md
new file mode 100644
index 0000000..575f76c
--- /dev/null
+++ b/docs/superpowers/specs/2026-06-05-perception-intelligence-design.md
@@ -0,0 +1,328 @@
+# Regulatory Signals Intelligence Enhancement — Design Spec
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Replace the 20-item hardcoded MockEventStore with real regulatory data from Chinese and international sources, add LLM-driven structured extraction, impact assessment, and semantic change diff — all accessible through a manual-trigger crawl in the frontend.
+
+**Architecture:** Crawler Service (httpx + BeautifulSoup) → PostgreSQL EventStore → LLM Pipeline (extract → assess → diff) → existing PerceptionService interface. New code follows `api → application → domain ports → infrastructure` layering; no new files in `services/*` or `workflows/*`; `shared/bootstrap.py` is the composition root.
+
+**Tech Stack:** httpx, BeautifulSoup4, existing EmbeddingService (text-embedding-v3, for diff), existing LLM factory (deepseek/qwen), existing KnowledgeRetrievalService (RAG), PostgreSQL (already available), existing SSE infrastructure.
+
+---
+
+## 1. Data Sources
+
+| Source | URL | Method | Coverage |
+|--------|-----|--------|----------|
+| CATARC 汽车标准 | `https://www.catarc.org.cn/bzzxd/qcbz/index.html` | httpx + BeautifulSoup (static pages) | 国家/行业汽车标准列表 |
+| 国标委强制性标准 | `https://openstd.samr.gov.cn/bzgk/std/std_list_type?p.p1=1&p.p2=车&p.p90=circulation_date&p.p91=desc` | httpx + JSON API parse | 强制性国家标准,按"车"过滤 |
+| 国标委推荐性标准 | `https://openstd.samr.gov.cn/bzgk/std/std_list_type?p.p1=2&p.p2=车&p.p90=circulation_date&p.p91=desc` | httpx + JSON API parse | 推荐性国家标准,按"车"过滤 |
+| EUR-Lex | RSS + CELLAR REST API | pyeurlex / httpx | EU AI Act, automotive directives |
+| UN R155/R156 | CELLAR REST API (CELEX lookup) | httpx | UN-ECE cybersecurity/OTA regulations |
+
+Crawl is **manual-trigger only** — no cron/Celery Beat. Admin clicks "刷新数据源" in the frontend UI.
+
+---
+
+## 2. Database Schema
+
+### New table: `regulation_events`
+
+```sql
+CREATE TABLE IF NOT EXISTS regulation_events (
+ id TEXT PRIMARY KEY, -- sha256(source + standard_code)[:12]
+ source TEXT NOT NULL, -- 'CATARC' | '国标委' | 'EUR-Lex' | 'UN-ECE'
+ source_label TEXT, -- Human-readable source label
+ standard_code TEXT NOT NULL, -- e.g. "GB 18384-2025", "EU/2024/1689"
+ title TEXT NOT NULL,
+ summary TEXT, -- Crawled abstract or first paragraph
+ full_text_url TEXT, -- Original page URL
+ status TEXT, -- 'enacted' | 'draft' | 'consultation'
+ impact_level TEXT, -- 'high' | 'medium' | 'low' (LLM-assigned)
+ published_at DATE,
+ effective_at DATE,
+ category TEXT,
+ tags TEXT[],
+ -- LLM structured extraction
+ obligations JSONB, -- [{text, deontic, subject, object, condition}]
+ deadlines JSONB, -- [{date, description}]
+ scope TEXT, -- Applicability scope summary
+ penalties TEXT, -- Penalty / consequence summary
+ -- Change tracking
+ content_hash TEXT, -- SHA256 of crawled full text
+ previous_hash TEXT, -- Hash from prior crawl (NULL on first crawl)
+ change_summary TEXT, -- LLM-generated description of changes
+ changed_sections JSONB, -- [{old_text, new_text, change_type}] where cosine<0.85
+ -- Impact assessment
+ affected_docs JSONB, -- [{doc_id, doc_name, score, key_clauses, recommendation}]
+ -- Metadata
+ crawled_at TIMESTAMPTZ DEFAULT now(),
+ processed_at TIMESTAMPTZ,
+ raw_storage_key TEXT -- MinIO path for raw HTML/PDF (optional)
+);
+
+CREATE INDEX IF NOT EXISTS regulation_events_source_date
+ ON regulation_events (source, published_at DESC);
+CREATE INDEX IF NOT EXISTS regulation_events_impact_date
+ ON regulation_events (impact_level, published_at DESC);
+CREATE INDEX IF NOT EXISTS regulation_events_tags
+ ON regulation_events USING gin(tags);
+```
+
+---
+
+## 3. Backend Architecture
+
+### 3.1 File Map
+
+**New files (infrastructure layer):**
+- `backend/app/infrastructure/perception/crawlers/catarc_crawler.py` — CATARC scraper
+- `backend/app/infrastructure/perception/crawlers/guobiao_crawler.py` — 国标委 JSON API crawler
+- `backend/app/infrastructure/perception/crawlers/eurlex_crawler.py` — EUR-Lex RSS + CELLAR
+- `backend/app/infrastructure/perception/crawlers/base.py` — Abstract base class
+- `backend/app/infrastructure/perception/postgres_event_store.py` — PostgresEventStore (replaces MockEventStore)
+- `backend/app/infrastructure/perception/llm_pipeline.py` — Extract / assess / diff pipeline
+
+**New files (application layer):**
+- `backend/app/application/perception/crawl_service.py` — Orchestrates crawlers + LLM pipeline, exposes `run_crawl(sources)` + progress generator
+
+**Modified files:**
+- `backend/app/api/routes/perception.py` — Add `POST /crawl` (SSE progress stream), `POST /events/{id}/process`, `GET /events/{id}/diff`
+- `backend/app/shared/bootstrap.py` — Wire `PostgresEventStore` + `CrawlService` + `LlmPipeline` when `DOCUMENT_REPOSITORY_BACKEND=postgres`; fallback to `MockEventStore` when `json`
+- `backend/app/config/settings.py` — Add `perception_crawl_timeout_seconds`, `perception_max_events_per_source`, `perception_diff_similarity_threshold`
+
+**Unchanged files:**
+- `backend/app/application/perception/services.py` — `PerceptionService` interface unchanged; only `_store` swap
+- `backend/app/infrastructure/perception/mock_event_store.py` — Kept for `json` backend mode
+
+### 3.2 Domain Port (Abstract Interface)
+
+```python
+# backend/app/infrastructure/perception/base_event_store.py
+from abc import ABC, abstractmethod
+
+class BaseEventStore(ABC):
+ @abstractmethod
+ def all(self) -> list[dict]: ...
+ @abstractmethod
+ def get(self, event_id: str) -> dict | None: ...
+ @abstractmethod
+ def filter(self, source=None, impact_level=None, limit=50) -> list[dict]: ...
+ @abstractmethod
+ def stats(self) -> dict: ...
+ @abstractmethod
+ def upsert(self, event: dict) -> None: ... # new — needed for crawl writes
+ @abstractmethod
+ def get_by_standard_code(self, code: str) -> dict | None: ... # for change detection
+```
+
+`MockEventStore` and `PostgresEventStore` both implement this interface.
+
+### 3.3 Crawler Base Contract
+
+```python
+# backend/app/infrastructure/perception/crawlers/base.py
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+
+@dataclass
+class RawEvent:
+ source: str
+ source_label: str
+ standard_code: str
+ title: str
+ summary: str
+ full_text_url: str
+ status: str # 'enacted' | 'draft' | 'consultation'
+ published_at: str # YYYY-MM-DD string
+ effective_at: str | None
+ category: str
+ tags: list[str]
+ raw_text: str # full crawled text for hashing + LLM
+
+class BaseCrawler(ABC):
+ @abstractmethod
+ def fetch(self, limit: int = 50) -> list[RawEvent]: ...
+```
+
+### 3.4 LLM Pipeline
+
+```python
+# backend/app/infrastructure/perception/llm_pipeline.py
+
+class LlmPipeline:
+ """Runs three sequential LLM steps on a regulation event."""
+
+ def extract_structure(self, event: dict) -> dict:
+ """Step 1: Extract obligations, deadlines, scope, penalties, impact_level.
+
+ Returns dict with keys: obligations, deadlines, scope, penalties, impact_level.
+ Uses JSON-mode or structured prompt; model retries once on parse failure.
+ """
+
+ def assess_impact(self, event: dict, retrieval_service) -> list[dict]:
+ """Step 2: RAG-based impact on existing knowledge base documents.
+
+ Query = standard_code + title + first obligation texts.
+ Returns list of {doc_id, doc_name, score, key_clauses, recommendation}.
+ """
+
+ def compute_diff(self, old_text: str, new_text: str) -> dict:
+ """Step 3: Semantic diff between old and new regulation text.
+
+ Splits both texts by paragraph. Calls existing EmbeddingService (text-embedding-v3
+ via EMBEDDING_BASE_URL) to embed each paragraph, then computes cosine similarity.
+ Changed paragraphs (cosine < 0.85) sent to LLM for change_type classification:
+ 'tightened' | 'relaxed' | 'added' | 'removed'
+ Returns {changed_sections: [...], change_summary: str}.
+ Only called when content_hash differs from previous_hash.
+ """
+```
+
+### 3.5 CrawlService
+
+```python
+# backend/app/application/perception/crawl_service.py
+
+class CrawlService:
+ def __init__(self, crawlers, event_store, llm_pipeline, retrieval_service): ...
+
+ def run_crawl(self, sources: list[str] | None = None) -> Generator[dict, None, None]:
+ """Manual-trigger crawl. Yields progress SSE dicts:
+ {event: 'progress', data: {source, fetched, new, updated, stage}}
+ {event: 'done', data: {total_new, total_updated, duration_ms}}
+ {event: 'error', data: {source, message}}
+
+ For each crawler:
+ 1. fetch() RawEvents
+ 2. hash check vs stored event → skip if unchanged
+ 3. upsert raw event to DB
+ 4. run LLM pipeline (extract → assess → diff)
+ 5. upsert enriched event to DB
+ 6. yield progress
+ """
+```
+
+---
+
+## 4. API Endpoints
+
+### Existing (unchanged interface, new store backend)
+- `GET /api/v1/perception/stats`
+- `GET /api/v1/perception/events`
+- `GET /api/v1/perception/events/{id}`
+- `POST /api/v1/perception/events/{id}/analyze` (streaming)
+
+### New endpoints
+
+```
+POST /api/v1/perception/crawl
+ Body: { sources?: ["CATARC", "国标委", "EUR-Lex", "UN-ECE"] }
+ Response: text/event-stream (SSE)
+ Auth: requires current_user (admin/legal role)
+ Streams progress events until done or error.
+
+POST /api/v1/perception/events/{id}/process
+ Trigger LLM pipeline for a single already-crawled event.
+ Response: { status: "ok", processed_at: "..." }
+ Auth: requires current_user
+
+GET /api/v1/perception/events/{id}/diff
+ Returns: { changed_sections: [...], change_summary: str, previous_hash: str }
+ Returns 404 if no diff available (first crawl or no change detected).
+```
+
+---
+
+## 5. Frontend Changes
+
+### 5.1 New: Crawl Control Bar (top of PerceptionPage)
+
+Above the stats-bar, add a `<CrawlBar>` component:
+- "刷新数据源" button — triggers `POST /crawl` (all sources)
+- Inline progress display: shows SSE progress events as a mini status line
+ - e.g. "CATARC: 抓取中… | 国标委: 12 条新增 | EUR-Lex: 等待中"
+- On completion: shows "更新完成 — 新增 N 条,更新 M 条"
+- Disabled while crawl is in progress (prevents double-trigger)
+
+### 5.2 Signal Card Enhancement
+
+Existing cards get two new indicators:
+- **NEW badge** — shown when `crawled_at` is within last 24h (green dot)
+- **CHANGED badge** — shown when `previous_hash != content_hash` and `change_summary` exists
+
+### 5.3 Right Panel — Structured Tab
+
+Right detail panel adds a tab bar: **概览 | 义务条款 | 影响评估 | 变更对比**
+
+**义务条款 tab:**
+- Table: 义务描述 | 主体 | 对象 | 截止日期
+- Tags for deontic type: 强制 / 禁止 / 允许
+- Shows `obligations[]` + `deadlines[]` from DB
+
+**影响评估 tab:**
+- Replaces hardcoded MOCK_DOCS with real `affected_docs[]` from DB
+- Each row: document name, similarity score (%), key clause excerpt, LLM recommendation
+- "Run fresh assessment" button → triggers `POST /events/{id}/process`
+
+**变更对比 tab:**
+- Tab is enabled only when `change_summary` is non-null
+- Top: `change_summary` text (LLM prose)
+- Below: diff table with old/new paragraph pairs, change_type badge per row
+- Tab stays visible but disabled (greyed out) on first-crawl events with no prior version
+
+### 5.4 Existing behavior preserved
+- `analyze` streaming (AI analysis) unchanged
+- Search/filter (source, impact) unchanged — now hits real DB data
+- Stats bar — now reflects real counts from PostgreSQL
+
+---
+
+## 6. Settings Additions
+
+```python
+# backend/app/config/settings.py additions
+perception_crawl_timeout_seconds: int = Field(default=120, ...)
+perception_max_events_per_source: int = Field(default=100, ...)
+perception_diff_similarity_threshold: float = Field(default=0.85, ...)
+```
+
+```env
+# .env additions
+PERCEPTION_CRAWL_TIMEOUT_SECONDS=120
+PERCEPTION_MAX_EVENTS_PER_SOURCE=100
+PERCEPTION_DIFF_SIMILARITY_THRESHOLD=0.85
+```
+
+---
+
+## 7. Dependencies
+
+```
+# requirements.txt additions
+httpx>=0.27.0 # already likely present; confirm
+beautifulsoup4>=4.12.0 # HTML parsing for CATARC
+lxml>=5.0.0 # BeautifulSoup parser backend
+# sentence-transformers NOT added — diff uses existing text-embedding-v3 API (EMBEDDING_BASE_URL)
+```
+
+No new infrastructure required (PostgreSQL + MinIO + Milvus already available).
+
+---
+
+## 8. Backward Compatibility
+
+- `DOCUMENT_REPOSITORY_BACKEND=json` → `bootstrap.py` uses `MockEventStore` (unchanged behavior)
+- `DOCUMENT_REPOSITORY_BACKEND=postgres` → uses `PostgresEventStore`
+- Migration: run `CREATE TABLE` SQL on first startup (idempotent `CREATE TABLE IF NOT EXISTS`)
+- Existing 20 mock events are not seeded to PostgreSQL; PostgreSQL starts empty until first crawl
+
+---
+
+## 9. Out of Scope (this phase)
+
+- Automatic/scheduled crawling (Celery Beat) — manual trigger only
+- Playwright-based JS-rendered pages — all target sites work with httpx
+- Knowledge Graph (Neo4j / LightRAG) — future phase
+- Email/Slack webhook notifications — future phase
+- User-facing diff history (versioning beyond one prior snapshot) — future phase
diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx
index d6f8d06..4dbc9c1 100644
--- a/frontend/src/App.tsx
+++ b/frontend/src/App.tsx
@@ -1,12 +1,14 @@
import './styles/globals.css';
-import { ThemeProvider, AuthProvider } from './contexts';
+import { ThemeProvider, AuthProvider, PageStateProvider } from './contexts';
import { AppRouter } from './router/AppRouter';
function App() {
return (
-
+
+
+
);
diff --git a/frontend/src/contexts/PageStateContext.tsx b/frontend/src/contexts/PageStateContext.tsx
new file mode 100644
index 0000000..ba22cc9
--- /dev/null
+++ b/frontend/src/contexts/PageStateContext.tsx
@@ -0,0 +1,211 @@
+/**
+ * PageStateContext — preserves page-level session state across route changes.
+ *
+ * When React Router unmounts a page component, all its useState values are lost.
+ * This context lives above the router and holds the state that must survive
+ * navigation so users can switch modules and return without losing their work.
+ *
+ * Covered pages:
+ * - RagChat: message history, citation rail, sessionId, input draft
+ * - Compliance: analysis result (sources, findings, conclusion, meta)
+ * - Perception: selected signal, filter state, AI analysis output
+ */
+
+import React, { createContext, useContext, useState, useCallback, useRef } from 'react';
+
+// ── RagChat types ─────────────────────────────────────────────────────────────
+
+export interface RagMessage {
+ id: string;
+ role: 'user' | 'assistant';
+ text: string;
+ citationRefs?: number[];
+}
+
+export interface RagCitation {
+ index: number;
+ score: number;
+ name: string;
+ clause: string;
+ snippet: string;
+ docId?: string;
+}
+
+export interface RagChatState {
+ messages: RagMessage[];
+ citations: RagCitation[];
+ sessionId: string | null;
+ inputDraft: string;
+}
+
+const RAG_INIT: RagChatState = {
+ messages: [
+ {
+ id: 'init',
+ role: 'assistant',
+ text: 'Hello! I can answer questions about your indexed regulations and compliance documents. Try asking about EU AI Act requirements, MIIT rules, or ISO/SAE 21434 scope.',
+ },
+ ],
+ citations: [],
+ sessionId: null,
+ inputDraft: '',
+};
+
+// ── Compliance types ──────────────────────────────────────────────────────────
+
+export interface ComplianceSourceEvent {
+ standard: string;
+ clause: string;
+ score: number;
+ status: string;
+ full_content: string;
+}
+
+// One compliance finding; useComplianceAnalysis builds these from `finding` stream events.
+// NOTE(review): that hook fills `status` with `j.status ?? 'info'`, and 'info' is not a
+// member of the union below — confirm whether 'info' should be added or the default changed.
+export interface ComplianceFindingEvent {
+ title: string;
+ desc: string;
+ status: 'ok' | 'warn' | 'risk';
+ clause_ref?: string;
+}
+
+export interface ComplianceActionItem {
+ label: string;
+ value: string;
+ risk?: boolean;
+}
+
+export interface ComplianceDonePayload {
+ conclusion: string;
+ actions: ComplianceActionItem[];
+ risk_score: number;
+ highlight_terms: string[];
+ para_text: string;
+}
+
+export interface ComplianceMeta {
+ title: string;
+ sourceType: 'text' | 'doc' | 'upload';
+ startedAt: string;
+}
+
+export type ComplianceStatus = 'idle' | 'streaming' | 'done' | 'error';
+
+export interface ComplianceState {
+ status: ComplianceStatus;
+ stageLabel: string;
+ stageKey: string;
+ meta: ComplianceMeta | null;
+ sources: ComplianceSourceEvent[];
+ findings: ComplianceFindingEvent[];
+ done: ComplianceDonePayload | null;
+ errorText: string;
+}
+
+const COMPLIANCE_INIT: ComplianceState = {
+ status: 'idle',
+ stageLabel: '',
+ stageKey: '',
+ meta: null,
+ sources: [],
+ findings: [],
+ done: null,
+ errorText: '',
+};
+
+// ── Perception types ──────────────────────────────────────────────────────────
+
+export interface PerceptionSignal {
+ id: string;
+ source: string;
+ standard: string;
+ status: 'ok' | 'warn' | 'risk' | 'info';
+ title: string;
+ summary: string;
+ date: string;
+ tags: string[];
+ impact: 'High' | 'Medium' | 'Low';
+}
+
+export interface PerceptionPageState {
+ signals: PerceptionSignal[];
+ searchQuery: string;
+ sourceFilter: string;
+ impactFilter: string;
+ selectedId: string | null;
+ aiOutput: string;
+ detailTab: 'overview' | 'obligations' | 'assessment' | 'diff';
+ crawlStatus: string;
+}
+
+const PERCEPTION_INIT: PerceptionPageState = {
+ signals: [],
+ searchQuery: '',
+ sourceFilter: 'All',
+ impactFilter: 'All',
+ selectedId: null,
+ aiOutput: '',
+ detailTab: 'overview',
+ crawlStatus: '',
+};
+
+// ── Context value ─────────────────────────────────────────────────────────────
+
+interface PageStateContextValue {
+ // RagChat
+ ragState: RagChatState;
+ setRagState: React.Dispatch>;
+ ragStreamingRef: React.MutableRefObject;
+ ragAbortRef: React.MutableRefObject;
+
+ // Compliance
+ complianceState: ComplianceState;
+ setComplianceState: React.Dispatch>;
+ complianceAbortRef: React.MutableRefObject;
+ resetCompliance: () => void;
+
+ // Perception
+ perceptionState: PerceptionPageState;
+ setPerceptionState: React.Dispatch>;
+ perceptionAbortRef: React.MutableRefObject;
+ perceptionCrawlAbortRef: React.MutableRefObject;
+}
+
+const PageStateContext = createContext(null);
+
+// ── Provider ──────────────────────────────────────────────────────────────────
+
+export function PageStateProvider({ children }: { children: React.ReactNode }) {
+ const [ragState, setRagState] = useState(RAG_INIT);
+ const ragStreamingRef = useRef(false);
+ const ragAbortRef = useRef(null);
+
+ const [complianceState, setComplianceState] = useState(COMPLIANCE_INIT);
+ const complianceAbortRef = useRef(null);
+
+ const resetCompliance = useCallback(() => {
+ complianceAbortRef.current?.abort();
+ setComplianceState(COMPLIANCE_INIT);
+ }, []);
+
+ const [perceptionState, setPerceptionState] = useState(PERCEPTION_INIT);
+ const perceptionAbortRef = useRef(null);
+ const perceptionCrawlAbortRef = useRef(null);
+
+ return (
+
+ {children}
+
+ );
+}
+
+// ── Hook ──────────────────────────────────────────────────────────────────────
+
+/** Return the shared page-state context value; throws when no PageStateProvider is mounted above the caller. */
+export function usePageState() {
+  const value = useContext(PageStateContext);
+  if (value) return value;
+  throw new Error('usePageState must be used inside PageStateProvider');
+}
diff --git a/frontend/src/contexts/index.ts b/frontend/src/contexts/index.ts
index 5267afb..693c63f 100644
--- a/frontend/src/contexts/index.ts
+++ b/frontend/src/contexts/index.ts
@@ -1,3 +1,18 @@
export { ThemeProvider, useTheme } from './ThemeContext';
export { AuthProvider, useAuth } from './AuthContext';
export type { AuthUser } from './AuthContext';
+export { PageStateProvider, usePageState } from './PageStateContext';
+export type {
+ RagChatState,
+ RagMessage,
+ RagCitation,
+ ComplianceState,
+ ComplianceStatus,
+ ComplianceSourceEvent,
+ ComplianceFindingEvent,
+ ComplianceDonePayload,
+ ComplianceMeta,
+ ComplianceActionItem,
+ PerceptionPageState,
+ PerceptionSignal,
+} from './PageStateContext';
diff --git a/frontend/src/pages/Compliance/useComplianceAnalysis.ts b/frontend/src/pages/Compliance/useComplianceAnalysis.ts
index 312c43e..9a63d71 100644
--- a/frontend/src/pages/Compliance/useComplianceAnalysis.ts
+++ b/frontend/src/pages/Compliance/useComplianceAnalysis.ts
@@ -1,4 +1,25 @@
-import { useState, useCallback, useRef } from 'react';
+/**
+ * useComplianceAnalysis — compliance analysis state wired to PageStateContext.
+ *
+ * State is stored in the global context so it persists when the user navigates
+ * to another module and returns. The `run` and `reset` actions are identical
+ * to the previous hook API so CompliancePage needs no structural changes.
+ */
+
+import { useCallback } from 'react';
+import { usePageState } from '../../contexts';
+import type {
+ ComplianceMeta,
+ ComplianceState,
+ ComplianceSourceEvent,
+ ComplianceFindingEvent,
+ ComplianceDonePayload,
+} from '../../contexts';
+
+export type { ComplianceMeta, ComplianceState, ComplianceSourceEvent as SourceEvent, ComplianceFindingEvent as FindingEvent, ComplianceDonePayload as DonePayload };
+export type { ComplianceActionItem as ActionItem } from '../../contexts';
+export type AnalysisStatus = import('../../contexts').ComplianceStatus;
+export type AnalysisMeta = ComplianceMeta;
const TOKEN_KEY = 'auth_token';
function authHeader(): Record {
@@ -6,55 +27,7 @@ function authHeader(): Record {
return t ? { Authorization: `Bearer ${t}` } : {};
}
-export type AnalysisStatus = 'idle' | 'streaming' | 'done' | 'error';
-
-export interface SourceEvent {
- standard: string;
- clause: string;
- score: number;
- status: string;
- full_content: string;
-}
-
-export interface FindingEvent {
- title: string;
- desc: string;
- status: 'ok' | 'warn' | 'risk';
- clause_ref?: string;
-}
-
-export interface ActionItem {
- label: string;
- value: string;
- risk?: boolean;
-}
-
-export interface DonePayload {
- conclusion: string;
- actions: ActionItem[];
- risk_score: number;
- highlight_terms: string[];
- para_text: string;
-}
-
-export interface AnalysisMeta {
- title: string;
- sourceType: 'text' | 'doc' | 'upload';
- startedAt: string; // ISO timestamp
-}
-
-export interface AnalysisState {
- status: AnalysisStatus;
- stageLabel: string;
- stageKey: string;
- meta: AnalysisMeta | null;
- sources: SourceEvent[];
- findings: FindingEvent[];
- done: DonePayload | null;
- errorText: string;
-}
-
-const INITIAL_STATE: AnalysisState = {
+const INITIAL_STATE: ComplianceState = {
status: 'idle',
stageLabel: '',
stageKey: '',
@@ -66,18 +39,12 @@ const INITIAL_STATE: AnalysisState = {
};
export function useComplianceAnalysis() {
- const [state, setState] = useState(INITIAL_STATE);
- const abortRef = useRef(null);
+ const { complianceState: state, setComplianceState: setState, complianceAbortRef, resetCompliance: reset } = usePageState();
- const reset = useCallback(() => {
- abortRef.current?.abort();
- setState(INITIAL_STATE);
- }, []);
-
- const run = useCallback(async (formData: FormData, meta: AnalysisMeta) => {
- abortRef.current?.abort();
+ const run = useCallback(async (formData: FormData, meta: ComplianceMeta) => {
+ complianceAbortRef.current?.abort();
const ctrl = new AbortController();
- abortRef.current = ctrl;
+ complianceAbortRef.current = ctrl;
setState({ ...INITIAL_STATE, status: 'streaming', stageLabel: 'Starting…', meta });
@@ -124,7 +91,7 @@ export function useComplianceAnalysis() {
if (j.type === 'stage') {
setState(s => ({ ...s, stageLabel: j.label ?? '', stageKey: j.stage ?? '' }));
} else if (j.type === 'source') {
- const src: SourceEvent = {
+ const src: ComplianceSourceEvent = {
standard: j.standard ?? '',
clause: j.clause ?? '',
score: j.score ?? 0,
@@ -133,7 +100,7 @@ export function useComplianceAnalysis() {
};
setState(s => ({ ...s, sources: [...s.sources, src] }));
} else if (j.type === 'finding') {
- const finding: FindingEvent = {
+ const finding: ComplianceFindingEvent = {
title: j.title ?? '',
desc: j.desc ?? '',
status: j.status ?? 'info',
@@ -141,7 +108,7 @@ export function useComplianceAnalysis() {
};
setState(s => ({ ...s, findings: [...s.findings, finding] }));
} else if (j.type === 'done') {
- const payload: DonePayload = {
+ const payload: ComplianceDonePayload = {
conclusion: j.conclusion ?? '',
actions: j.actions ?? [],
risk_score: j.risk_score ?? 0,
@@ -162,7 +129,7 @@ export function useComplianceAnalysis() {
if (e instanceof Error && e.name === 'AbortError') return;
setState(s => ({ ...s, status: 'error', errorText: String(e) }));
}
- }, []);
+ }, [setState, complianceAbortRef]);
return { state, run, reset };
}
diff --git a/frontend/src/pages/Perception/PerceptionPage.tsx b/frontend/src/pages/Perception/PerceptionPage.tsx
index 7a342b3..226cfdc 100644
--- a/frontend/src/pages/Perception/PerceptionPage.tsx
+++ b/frontend/src/pages/Perception/PerceptionPage.tsx
@@ -1,6 +1,8 @@
import { useState, useEffect, useRef } from 'react';
import { Topbar } from '../../components/layout/Topbar';
import { RefreshCw, Play, Square, ExternalLink } from 'lucide-react';
+import { usePageState } from '../../contexts';
+import type { PerceptionSignal } from '../../contexts';
const TOKEN_KEY = 'auth_token';
function authHeader(): Record {
@@ -8,18 +10,6 @@ function authHeader(): Record {
return t ? { Authorization: `Bearer ${t}` } : {};
}
-interface Signal {
- id: string;
- source: string;
- standard: string;
- status: 'ok' | 'warn' | 'risk' | 'info';
- title: string;
- summary: string;
- date: string;
- tags: string[];
- impact: 'High' | 'Medium' | 'Low';
-}
-
interface Stats {
total: number;
high_impact: number;
@@ -27,29 +17,17 @@ interface Stats {
last_90_days: number;
}
-interface DocResult {
- score: number;
- name: string;
- clause: string;
- snippet: string;
-}
-
const SOURCES = ['All', 'MIIT', 'UN-ECE', 'ISO', 'GB Comm.', 'EUR-Lex', 'IATF'];
const IMPACTS = ['All', 'High', 'Medium', 'Low'];
-// Backend /api/v1/perception/stats returns:
-// { total, high_impact, medium_impact, last_90_days } — field names match, ✓
-
-// Backend /api/v1/perception/events returns:
-// { events: [{ id, title, summary, source, standard, impact_level, published_at, tags, status }] }
-// Map backend event fields → frontend Signal shape
-function mapEvent(e: Record): Signal {
+// Backend event → Signal
+function mapEvent(e: Record): PerceptionSignal {
const impact = String(e.impact_level ?? '').toLowerCase();
const backendStatus = String(e.status ?? '').toLowerCase();
return {
id: String(e.id ?? e.event_id ?? ''),
source: String(e.source ?? ''),
- standard: String(e.standard ?? e.regulation_id ?? ''),
+ standard: String(e.standard ?? e.standard_code ?? e.regulation_id ?? ''),
status: backendStatus === 'high' || backendStatus === 'urgent' ? 'risk'
: backendStatus === 'medium' || backendStatus === 'draft' ? 'warn'
: backendStatus === 'low' || backendStatus === 'final' ? 'ok'
@@ -62,50 +40,40 @@ function mapEvent(e: Record): Signal {
};
}
-const MOCK_SIGNALS: Signal[] = [
+const MOCK_SIGNALS: PerceptionSignal[] = [
{
id: '1', source: 'EUR-Lex', standard: 'EU/2024/1689', status: 'risk',
title: 'EU AI Act — High-risk AI in vehicles',
summary: 'Article 9 mandates risk management systems for automotive AI classifying as high-risk under Annex III point 3.',
- date: '2025-11-18', tags: ['automotive', 'GDPR', 'certification'], impact: 'High'
+ date: '2025-11-18', tags: ['automotive', 'GDPR', 'certification'], impact: 'High',
},
{
id: '2', source: 'MIIT', standard: 'Draft-2025-08', status: 'warn',
title: 'MIIT Draft — in-vehicle AI training data',
summary: 'Draft regulation requires OEM data provenance documentation and OTA audit trails for AI systems.',
- date: '2025-10-30', tags: ['OTA', 'data-governance', 'China'], impact: 'High'
+ date: '2025-10-30', tags: ['OTA', 'data-governance', 'China'], impact: 'High',
},
{
id: '3', source: 'ISO', standard: 'ISO/SAE 21434:2021/Amd1', status: 'info',
title: 'ISO/SAE 21434 Amendment 1',
summary: 'Amendment clarifies CSMS scope for software-only updates and vulnerability disclosure timelines.',
- date: '2025-10-05', tags: ['cybersecurity', 'CSMS', 'ISO'], impact: 'Medium'
+ date: '2025-10-05', tags: ['cybersecurity', 'CSMS', 'ISO'], impact: 'Medium',
},
- {
- id: '4', source: 'UN-ECE', standard: 'UNECE WP.29 R155', status: 'ok',
- title: 'UNECE R155 Corrigendum',
- summary: 'Editorial corrections to cybersecurity management system requirements. No substantive changes.',
- date: '2025-09-12', tags: ['type-approval', 'UNECE'], impact: 'Low'
- },
-];
-
-const MOCK_DOCS: DocResult[] = [
- { score: 94, name: 'Vehicle AI Safety Manual v3.2', clause: '§4.2.1', snippet: 'The risk management process shall identify and evaluate risks arising from AI system decisions in safety-critical scenarios...' },
- { score: 87, name: 'ADAS System Requirements', clause: '§7.1', snippet: 'Automated driving functions must document training data lineage and model performance envelopes prior to deployment.' },
- { score: 71, name: 'Type Approval Documentation', clause: 'Annex B', snippet: 'Cybersecurity management system certification requires third-party audit of AI decision audit logs retention policy.' },
];
export function PerceptionPage() {
- const [stats, setStats] = useState(null);
- const [signals, setSignals] = useState(MOCK_SIGNALS);
- const [searchQuery, setSearchQuery] = useState('');
- const [sourceFilter, setSourceFilter] = useState('All');
- const [impactFilter, setImpactFilter] = useState('All');
- const [selected, setSelected] = useState(null);
- const [streaming, setStreaming] = useState(false);
- const [aiOutput, setAiOutput] = useState('');
- const abortRef = useRef(null);
+ // Persistent state lives in PageStateContext — survives route changes
+ const { perceptionState, setPerceptionState, perceptionAbortRef, perceptionCrawlAbortRef } = usePageState();
+ const { signals, searchQuery, sourceFilter, impactFilter, selectedId, aiOutput, detailTab, crawlStatus } = perceptionState;
+ // Stats and selectedFull are lightweight to re-fetch on mount
+ const [stats, setStats] = useState(null);
+ const [streaming, setStreaming] = useState(false);
+ const [crawling, setCrawling] = useState(false);
+ // Full event detail — re-fetched when selected changes or page mounts with a selection
+ const [selectedFull, setSelectedFull] = useState | null>(null);
+
+ // Re-fetch stats every time the page mounts
useEffect(() => {
fetch('/api/v1/perception/stats', { headers: authHeader() })
.then(r => r.json())
@@ -113,16 +81,36 @@ export function PerceptionPage() {
.catch(() => setStats({ total: 47, high_impact: 7, medium_impact: 18, last_90_days: 14 }));
}, []);
+ // Fetch signal list on first mount only (if empty), otherwise preserve context state
useEffect(() => {
+ if (signals.length > 0) return; // already loaded
fetch('/api/v1/perception/events?limit=100', { headers: authHeader() })
.then(r => r.json())
.then(d => {
if (Array.isArray(d?.events) && d.events.length > 0) {
- setSignals(d.events.map(mapEvent));
+ setPerceptionState(s => ({ ...s, signals: d.events.map(mapEvent) }));
+ } else {
+ setPerceptionState(s => ({ ...s, signals: MOCK_SIGNALS }));
}
})
- .catch(() => { /* keep mock data on error */ });
- }, []);
+ .catch(() => {
+ setPerceptionState(s => ({ ...s, signals: s.signals.length > 0 ? s.signals : MOCK_SIGNALS }));
+ });
+ }, []); // eslint-disable-line react-hooks/exhaustive-deps
+
+  // Load the full event detail whenever the selection changes (this also runs
+  // on mount when a selection is already stored in the page state).
+  // The request is aborted on cleanup so that a slow response belonging to a
+  // previous selection cannot overwrite the detail of the current one, and no
+  // state is set after the page has unmounted.
+  useEffect(() => {
+    if (!selectedId) {
+      setSelectedFull(null);
+      return;
+    }
+    const ctrl = new AbortController();
+    fetch(`/api/v1/perception/events/${selectedId}`, { headers: authHeader(), signal: ctrl.signal })
+      .then(r => r.ok ? r.json() : null)
+      .then(d => { if (d && !ctrl.signal.aborted) setSelectedFull(d); })
+      .catch(() => { /* aborted or network failure — leave selectedFull unchanged */ });
+    return () => ctrl.abort();
+  }, [selectedId]);
+
+ const selected = signals.find(s => s.id === selectedId) ?? null;
const filtered = signals.filter(s => {
if (sourceFilter !== 'All' && s.source !== sourceFilter) return false;
@@ -137,13 +125,20 @@ export function PerceptionPage() {
function runAnalysis() {
if (!selected) return;
setStreaming(true);
- setAiOutput('');
+ setPerceptionState(s => ({ ...s, aiOutput: '' }));
const ctrl = new AbortController();
- abortRef.current = ctrl;
- // Backend: POST /api/v1/perception/events/{id}/analyze → SSE stream
- fetch(`/api/v1/perception/events/${selected.id}/analyze`, { method: 'POST', headers: authHeader(), signal: ctrl.signal })
+ perceptionAbortRef.current = ctrl;
+ fetch(`/api/v1/perception/events/${selected.id}/analyze`, {
+ method: 'POST',
+ headers: authHeader(),
+ signal: ctrl.signal,
+ })
.then(async res => {
- if (!res.body) { setAiOutput('No stream available.'); setStreaming(false); return; }
+ if (!res.body) {
+ setPerceptionState(s => ({ ...s, aiOutput: 'No stream available.' }));
+ setStreaming(false);
+ return;
+ }
const reader = res.body.getReader();
const dec = new TextDecoder();
let buf = '';
@@ -160,30 +155,99 @@ export function PerceptionPage() {
if (!raw || raw === '[DONE]') continue;
try {
const j = JSON.parse(raw);
- if (j.text) setAiOutput(p => p + j.text);
- else if (typeof j === 'string') setAiOutput(p => p + j);
+ if (j.text) setPerceptionState(s => ({ ...s, aiOutput: s.aiOutput + j.text }));
+ else if (typeof j === 'string') setPerceptionState(s => ({ ...s, aiOutput: s.aiOutput + j }));
} catch {
- setAiOutput(p => p + raw);
+ setPerceptionState(s => ({ ...s, aiOutput: s.aiOutput + raw }));
}
}
}
setStreaming(false);
})
.catch(e => {
- if (e.name !== 'AbortError') setAiOutput('Analysis failed. Check API connection.');
+ if (e.name !== 'AbortError') setPerceptionState(s => ({ ...s, aiOutput: 'Analysis failed. Check API connection.' }));
setStreaming(false);
});
}
function stopAnalysis() {
- abortRef.current?.abort();
+ // Abort the in-flight analysis request, if any, then clear the streaming flag.
+ perceptionAbortRef.current?.abort();
setStreaming(false);
}
- function selectSignal(sig: Signal) {
- setSelected(sig);
- setAiOutput('');
+  /**
+   * Trigger a manual crawl via POST /api/v1/perception/crawl and mirror the
+   * streamed progress into `perceptionState.crawlStatus`.
+   *
+   * The response is read as SSE: blocks are separated by a blank line and each
+   * carries an `event: ` name (`progress`, `done` or `error`) and one JSON
+   * `data: ` line. On `done` the signal list is reloaded from the events
+   * endpoint. The local `crawling` flag is set for the duration of the call.
+   */
+  async function runCrawl() {
+    setCrawling(true);
+    setPerceptionState(s => ({ ...s, crawlStatus: '正在连接数据源...' }));
+
+    // Apply one SSE block to the page state; blocks without a data line are ignored.
+    const handleBlock = (block: string) => {
+      const lines = block.split('\n');
+      const evtName = lines.find(l => l.startsWith('event: '))?.slice(7).trim();
+      const raw = lines.find(l => l.startsWith('data: '))?.slice(6).trim();
+      if (!raw) return;
+      try {
+        const d = JSON.parse(raw);
+        if (evtName === 'progress') {
+          const stageText = d.stage === 'fetching' ? '抓取中...'
+            : d.stage === 'processing' ? `处理 ${d.fetched} 条...`
+            : `完成 +${d.new} 条`;
+          setPerceptionState(s => ({ ...s, crawlStatus: `${d.source}: ${stageText}` }));
+        } else if (evtName === 'done') {
+          setPerceptionState(s => ({ ...s, crawlStatus: `更新完成 — 新增 ${d.total_new} 条,更新 ${d.total_updated} 条` }));
+          // Reload the list so the crawl result becomes visible.
+          fetch('/api/v1/perception/events?limit=100', { headers: authHeader() })
+            .then(r => r.json())
+            .then(d2 => {
+              if (Array.isArray(d2?.events)) {
+                setPerceptionState(s => ({ ...s, signals: d2.events.map(mapEvent) }));
+              }
+            })
+            .catch(() => { /* reload failed — keep the list that is already shown */ });
+        } else if (evtName === 'error') {
+          setPerceptionState(s => ({
+            ...s,
+            crawlStatus: `错误: ${typeof d === 'string' ? d : d?.message}`,
+          }));
+        }
+      } catch { /* malformed JSON payload — skip this block */ }
+    };
+
+    try {
+      const res = await fetch('/api/v1/perception/crawl', {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json', ...authHeader() },
+        body: JSON.stringify({}),
+      });
+      if (!res.ok) {
+        // Report non-2xx responses instead of trying to parse them as a stream.
+        setPerceptionState(s => ({ ...s, crawlStatus: `错误: HTTP ${res.status}` }));
+        return;
+      }
+      if (!res.body) {
+        setPerceptionState(s => ({ ...s, crawlStatus: 'No stream' }));
+        return;
+      }
+      const reader = res.body.getReader();
+      const dec = new TextDecoder();
+      let buf = '';
+      while (true) {
+        const { done, value } = await reader.read();
+        if (done) break;
+        // stream: true keeps a multi-byte character that is split across two
+        // network chunks intact instead of decoding each half separately.
+        buf += dec.decode(value, { stream: true });
+        const parts = buf.split('\n\n');
+        buf = parts.pop() ?? '';
+        parts.forEach(block => handleBlock(block));
+      }
+      // Flush the decoder and handle a final block that had no trailing blank line.
+      buf += dec.decode();
+      if (buf.trim()) handleBlock(buf);
+    } catch (e: unknown) {
+      setPerceptionState(s => ({
+        ...s,
+        crawlStatus: `连接失败: ${e instanceof Error ? e.message : String(e)}`,
+      }));
+    } finally {
+      // Runs on every exit path, including the early returns above.
+      setCrawling(false);
+    }
+  }
+
+ // Make `sig` the selected signal: store its id in the shared page state,
+ // clear the previous AI output, switch the detail pane back to the overview
+ // tab, drop the previously loaded full detail, and abort any analysis
+ // request that is still in flight.
+ function selectSignal(sig: PerceptionSignal) {
+ setPerceptionState(s => ({
+ ...s,
+ selectedId: sig.id,
+ aiOutput: '',
+ detailTab: 'overview',
+ }));
+ setSelectedFull(null);
setStreaming(false);
+ perceptionAbortRef.current?.abort();
}
return (
@@ -197,10 +261,18 @@ export function PerceptionPage() {
setSearchQuery(e.target.value)}
+ onChange={e => setPerceptionState(s => ({ ...s, searchQuery: e.target.value }))}
/>
-
+
+ {crawlStatus && (
+
+ {crawlStatus}
+
+ )}
>
}
/>
@@ -227,13 +299,25 @@ export function PerceptionPage() {
{SOURCES.map(s => (
-
+
))}
{IMPACTS.map(i => (
-
+
))}
@@ -243,7 +327,7 @@ export function PerceptionPage() {
{filtered.map(sig => (
selectSignal(sig)}
>
@@ -277,8 +361,11 @@ export function PerceptionPage() {
{selected.source}
{selected.standard}
- {selected.status === 'risk' ? 'Urgent' : 'Published'}
+ {selected.status === 'risk' ? 'Urgent' : selected.status === 'warn' ? 'Draft' : 'Published'}
+ {selectedFull?.change_summary && (
+ CHANGED
+ )}
{selected.title}
{selected.summary}
@@ -287,23 +374,160 @@ export function PerceptionPage() {
?
:
}
-
+ {selected && (
+
+ Source
+
+ )}
-
-
Affected documents
- {MOCK_DOCS.map(d => (
-
-
{d.score}%
-
-
{d.name} {d.clause}
-
{d.snippet}
-
-
+
+ {(['overview', 'obligations', 'assessment', 'diff'] as const).map(tab => (
+
))}
+ {detailTab === 'overview' && (
+
+
Scope & Summary
+
+ {(selectedFull?.scope as string) || selected.summary}
+
+ {selectedFull?.penalties && (
+
+ ⚠ {selectedFull.penalties as string}
+
+ )}
+
+ )}
+
+ {detailTab === 'obligations' && (
+
+
义务条款
+ {(() => {
+ const obs = (selectedFull?.obligations as Array
>) || [];
+ const deadlines = (selectedFull?.deadlines as Array>) || [];
+ return obs.length === 0 && deadlines.length === 0 ? (
+ 暂无结构化数据。点击右上角"Run impact analysis"触发提取。
+ ) : (
+ <>
+ {obs.length > 0 && (
+
+
+
+ | 义务描述 |
+ 主体 |
+ 类型 |
+
+
+
+ {obs.map((ob, i) => (
+
+ | {ob.text} |
+ {ob.subject} |
+
+
+ {ob.deontic}
+
+ |
+
+ ))}
+
+
+ )}
+ {deadlines.length > 0 && (
+
+
截止日期
+ {deadlines.map((d, i) => (
+
+ {d.date || '待定'}
+ {d.description}
+
+ ))}
+
+ )}
+ >
+ );
+ })()}
+
+ )}
+
+ {detailTab === 'assessment' && (
+
+
Affected documents
+ {(() => {
+ const docs = (selectedFull?.affected_docs as Array
>);
+ const displayDocs = docs && docs.length > 0 ? docs : [];
+ return displayDocs.length === 0
+ ? No affected documents found.
+ : displayDocs.map((d, i) => (
+
+
{Math.round(Number(d.score ?? 0) * 100)}%
+
+
+ {String(d.doc_name || '')}
+ {String(d.key_clauses || d.clause || '')}
+
+ {d.snippet &&
{String(d.snippet)}
}
+ {d.recommendation && (
+
→ {String(d.recommendation)}
+ )}
+
+
+ ));
+ })()}
+
+ )}
+
+ {detailTab === 'diff' && selectedFull?.change_summary && (
+
+
变更对比
+
+ {selectedFull.change_summary as string}
+
+ {(() => {
+ const sections = (selectedFull.changed_sections as Array
>) || [];
+ return sections.map((s, i) => (
+
+
+
+ {String(s.change_type)}
+
+ cosine: {String(s.similarity)}
+
+
+
+
旧版
+ {String(s.old_text || '')}
+
+
+
新版
+ {String(s.new_text || '')}
+
+
+ {s.summary &&
{String(s.summary)}
}
+
+ ));
+ })()}
+
+ )}
+
{(aiOutput || streaming) && (
AI Impact Analysis
diff --git a/frontend/src/pages/RagChat/RagChatPage.tsx b/frontend/src/pages/RagChat/RagChatPage.tsx
index ad508d4..c3a5681 100644
--- a/frontend/src/pages/RagChat/RagChatPage.tsx
+++ b/frontend/src/pages/RagChat/RagChatPage.tsx
@@ -1,6 +1,8 @@
-import { useState, useRef, useEffect, useCallback } from 'react';
+import { useRef, useEffect, useCallback, useState } from 'react';
import { Topbar } from '../../components/layout/Topbar';
import { Send, Download } from 'lucide-react';
+import { usePageState } from '../../contexts';
+import type { RagCitation } from '../../contexts';
const TOKEN_KEY = 'auth_token';
function authHeader(): Record
{
@@ -8,26 +10,8 @@ function authHeader(): Record {
return t ? { Authorization: `Bearer ${t}` } : {};
}
-interface Message {
- id: string;
- role: 'user' | 'assistant';
- text: string;
- // citation indices mentioned in this assistant message (1-based, matching citations array)
- citationRefs?: number[];
-}
-
-interface Citation {
- index: number; // 1-based, matches [N] markers in text
- score: number; // 0–100 display percentage
- name: string; // doc_name
- clause: string; // section_title or clause
- snippet: string; // preview text
- docId?: string;
-}
-
// Map a raw source doc from the backend "retrieved" event to our Citation shape.
-// Backend fields: { id, score(0-1), preview, doc_name, clause, doc_id }
-function mapSource(s: Record, idx: number): Citation {
+function mapSource(s: Record, idx: number): RagCitation {
const rawScore = typeof s.score === 'number' ? s.score : 0;
const displayScore = rawScore <= 1 ? Math.round(rawScore * 100) : Math.round(rawScore);
return {
@@ -73,25 +57,21 @@ const MOCK_QUICK = [
];
export function RagChatPage() {
- const [messages, setMessages] = useState([
- {
- id: 'init', role: 'assistant',
- text: 'Hello! I can answer questions about your indexed regulations and compliance documents. Try asking about EU AI Act requirements, MIIT rules, or ISO/SAE 21434 scope.',
- }
- ]);
- const [quickPrompts, setQuickPrompts] = useState(MOCK_QUICK);
- const [input, setInput] = useState('');
- const [streaming, setStreaming] = useState(false);
- const [citations, setCitations] = useState([]);
+ // All persistent state lives in PageStateContext — survives route changes
+ const { ragState, setRagState, ragStreamingRef, ragAbortRef } = usePageState();
+ const { messages, citations, sessionId, inputDraft } = ragState;
+
+ // Local-only UI state: highlighted citation and streaming indicator
+ // These are fine to reset on navigation since they're transient UI feedback
const [highlightedCit, setHighlightedCit] = useState(null);
- const [sessionId, setSessionId] = useState(null);
+ const [streaming, setStreaming] = useState(ragStreamingRef.current);
+ const [quickPrompts, setQuickPrompts] = useState(MOCK_QUICK);
const bottomRef = useRef(null);
const citRailRef = useRef(null);
const citItemRefs = useRef>({});
- const abortRef = useRef(null);
- // Fetch quick questions from backend on mount
+ // Fetch quick questions from backend on mount (only once per session)
useEffect(() => {
fetch('/api/v1/rag/quick-questions', { headers: authHeader() })
.then(r => r.json())
@@ -115,26 +95,33 @@ export function RagChatPage() {
if (el) {
el.scrollIntoView({ behavior: 'smooth', block: 'nearest' });
}
- // Clear highlight after 3s
setTimeout(() => setHighlightedCit(h => h === n ? null : h), 3000);
}, []);
async function send(text?: string) {
- const q = (text ?? input).trim();
- if (!q || streaming) return;
- setInput('');
-
- const userMsg: Message = { id: Date.now().toString(), role: 'user', text: q };
- setMessages(m => [...m, userMsg]);
+ const q = (text ?? inputDraft).trim();
+ if (!q || ragStreamingRef.current) return;
+ setRagState(s => ({ ...s, inputDraft: '' }));
+ const userMsgId = Date.now().toString();
const assistantId = (Date.now() + 1).toString();
- setMessages(m => [...m, { id: assistantId, role: 'assistant', text: '' }]);
+
+ setRagState(s => ({
+ ...s,
+ messages: [
+ ...s.messages,
+ { id: userMsgId, role: 'user', text: q },
+ { id: assistantId, role: 'assistant', text: '' },
+ ],
+ citations: [],
+ }));
+
+ ragStreamingRef.current = true;
setStreaming(true);
- setCitations([]);
setHighlightedCit(null);
const ctrl = new AbortController();
- abortRef.current = ctrl;
+ ragAbortRef.current = ctrl;
try {
const body: Record = { query: q, top_k: 5 };
@@ -151,14 +138,13 @@ export function RagChatPage() {
const reader = res.body.getReader();
const dec = new TextDecoder();
let buffer = '';
- const newCitations: Citation[] = [];
+ const newCitations: RagCitation[] = [];
while (true) {
const { done, value } = await reader.read();
if (done) break;
buffer += dec.decode(value, { stream: true });
- // SSE blocks separated by double newline
const blocks = buffer.split('\n\n');
buffer = blocks.pop() ?? '';
@@ -171,56 +157,62 @@ export function RagChatPage() {
const j = JSON.parse(raw);
if (j.type === 'session') {
- // Backend assigned a session_id — persist for next request
- if (j.session_id) setSessionId(j.session_id);
+ if (j.session_id) setRagState(s => ({ ...s, sessionId: j.session_id }));
} else if (j.type === 'retrieved' && Array.isArray(j.docs)) {
- // Sources arrive before the answer starts
const mapped = j.docs.map((d: Record, i: number) => mapSource(d, i + 1));
newCitations.push(...mapped);
- setCitations([...mapped]);
+ setRagState(s => ({ ...s, citations: [...mapped] }));
} else if (j.type === 'chunk' && j.text) {
- setMessages(m => m.map(msg =>
- msg.id === assistantId
- ? { ...msg, text: msg.text + (j.text as string) }
- : msg
- ));
-
- } else if (j.type === 'status') {
- // Status message (e.g. "找到N条相关法规…") — could show in UI if desired
- // For now we ignore it to keep the bubble clean
+ setRagState(s => ({
+ ...s,
+ messages: s.messages.map(msg =>
+ msg.id === assistantId
+ ? { ...msg, text: msg.text + (j.text as string) }
+ : msg
+ ),
+ }));
} else if (j.type === 'done') {
- // Extract which citation numbers appear in the final answer
- setMessages(m => m.map(msg => {
- if (msg.id !== assistantId) return msg;
- const refs = [...new Set(
- [...msg.text.matchAll(/\[(\d+)\]/g)].map(r => parseInt(r[1], 10))
- )].filter(n => n >= 1 && n <= newCitations.length);
- return { ...msg, citationRefs: refs };
+ setRagState(s => ({
+ ...s,
+ messages: s.messages.map(msg => {
+ if (msg.id !== assistantId) return msg;
+ const refs = [...new Set(
+ [...msg.text.matchAll(/\[(\d+)\]/g)].map(r => parseInt(r[1], 10))
+ )].filter(n => n >= 1 && n <= newCitations.length);
+ return { ...msg, citationRefs: refs };
+ }),
}));
break;
} else if (j.type === 'error') {
- setMessages(m => m.map(msg =>
- msg.id === assistantId
- ? { ...msg, text: `Error: ${j.text ?? 'Unknown error'}` }
- : msg
- ));
+ setRagState(s => ({
+ ...s,
+ messages: s.messages.map(msg =>
+ msg.id === assistantId
+ ? { ...msg, text: `Error: ${j.text ?? 'Unknown error'}` }
+ : msg
+ ),
+ }));
}
} catch { /* malformed JSON chunk, skip */ }
}
}
} catch (e: unknown) {
if (e instanceof Error && e.name !== 'AbortError') {
- setMessages(m => m.map(msg =>
- msg.id === assistantId
- ? { ...msg, text: 'Could not reach the RAG API. Please check the backend.' }
- : msg
- ));
+ setRagState(s => ({
+ ...s,
+ messages: s.messages.map(msg =>
+ msg.id === assistantId
+ ? { ...msg, text: 'Could not reach the RAG API. Please check the backend.' }
+ : msg
+ ),
+ }));
}
} finally {
+ ragStreamingRef.current = false;
setStreaming(false);
}
}
@@ -291,15 +283,15 @@ export function RagChatPage() {