fix somethings

2026-06-08 11:16:28 +08:00
parent 9fea9c6a53
commit e7963b267e
34 changed files with 5195 additions and 246 deletions
--- a/backend/app/api/routes/perception.py
+++ b/backend/app/api/routes/perception.py
@@ -4,10 +4,12 @@ from __future__ import annotations

 import json

-from fastapi import APIRouter, Query
+from fastapi import APIRouter, Depends, Query
 from fastapi.responses import StreamingResponse

-from app.shared.bootstrap import get_perception_service
+from app.shared.bootstrap import get_crawl_service, get_event_store, get_perception_service
+from app.api.dependencies.auth import get_current_user
+from app.domain.auth.models import UserClaims
 from app.shared.async_utils import iter_in_thread

 router = APIRouter(prefix="/perception", tags=["智能感知"])
@@ -65,3 +67,77 @@ async def analyze_event(event_id: str):
            "X-Accel-Buffering": "no",
        },
    )
+
+
+@router.post("/crawl")
+async def run_crawl(
+    body: dict = None,
+    current_user: UserClaims = Depends(get_current_user),
+):
+    """Trigger manual crawl of regulatory sources. Streams SSE progress.
+
+    Body (optional): {"sources": ["CATARC", "国标委·强制性", "EUR-Lex"]}
+    Omit sources to crawl all registered sources.
+    """
+    sources: list[str] | None = (body or {}).get("sources")
+    crawl_svc = get_crawl_service()
+
+    async def crawl_stream():
+        async for item in iter_in_thread(crawl_svc.run_crawl(sources=sources)):
+            event_name = item.get("event", "message")
+            data = item.get("data", "")
+            if isinstance(data, (dict, list)):
+                data = json.dumps(data, ensure_ascii=False)
+            yield f"event: {event_name}\ndata: {data}\n\n"
+
+    return StreamingResponse(
+        crawl_stream(),
+        media_type="text/event-stream",
+        headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
+    )
+
+
+@router.post("/events/{event_id}/process")
+async def process_event(
+    event_id: str,
+    current_user: UserClaims = Depends(get_current_user),
+):
+    """Trigger LLM pipeline (extract + assess + diff) for a single event."""
+    from datetime import UTC, datetime
+    from app.infrastructure.perception.llm_pipeline import LlmPipeline
+    from app.shared.bootstrap import get_retrieval_service
+
+    event = get_perception_service().get_event(event_id)
+    if not event:
+        from fastapi import HTTPException
+        raise HTTPException(status_code=404, detail=f"Event {event_id} not found")
+
+    store = get_event_store()
+    pipeline = LlmPipeline()
+
+    structure = pipeline.extract_structure(event)
+    event.update(structure)
+    event["affected_docs"] = pipeline.assess_impact(event, get_retrieval_service())
+    event["processed_at"] = datetime.now(UTC).isoformat()
+    store.upsert(event)
+
+    return {"status": "ok", "event_id": event_id, "processed_at": event["processed_at"]}
+
+
+@router.get("/events/{event_id}/diff")
+async def get_event_diff(event_id: str):
+    """Return semantic diff detail for an event (only available if previously crawled twice)."""
+    event = get_perception_service().get_event(event_id)
+    if not event:
+        from fastapi import HTTPException
+        raise HTTPException(status_code=404, detail=f"Event {event_id} not found")
+    if not event.get("change_summary"):
+        from fastapi import HTTPException
+        raise HTTPException(status_code=404, detail="No diff available for this event")
+    return {
+        "event_id": event_id,
+        "change_summary": event.get("change_summary"),
+        "changed_sections": event.get("changed_sections") or [],
+        "previous_hash": event.get("previous_hash"),
+        "content_hash": event.get("content_hash"),
+    }
--- a/backend/app/application/perception/crawl_service.py
+++ b/backend/app/application/perception/crawl_service.py
@@ -0,0 +1,147 @@
+"""Orchestrates regulatory source crawlers and LLM enrichment pipeline."""
+
+from __future__ import annotations
+
+import hashlib
+from typing import Any, Generator
+
+from loguru import logger
+
+from app.infrastructure.perception.base_event_store import BaseEventStore
+from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
+from app.infrastructure.perception.llm_pipeline import LlmPipeline
+
+
+def _event_id(source: str, standard_code: str) -> str:
+    """Deterministic 12-char ID from source + standard_code."""
+    return hashlib.sha256(f"{source}-{standard_code}".encode()).hexdigest()[:12]
+
+
+def _content_hash(raw_text: str) -> str:
+    return hashlib.sha256(raw_text.encode()).hexdigest()
+
+
+def _raw_to_dict(raw: RawEvent, event_id: str, content_hash: str) -> dict:
+    return {
+        "id": event_id,
+        "source": raw.source,
+        "source_label": raw.source_label,
+        "standard_code": raw.standard_code,
+        "title": raw.title,
+        "summary": raw.summary,
+        "full_text_url": raw.full_text_url,
+        "status": raw.status,
+        "impact_level": "medium",
+        "published_at": raw.published_at,
+        "effective_at": raw.effective_at,
+        "category": raw.category,
+        "tags": raw.tags,
+        "content_hash": content_hash,
+        "previous_hash": None,
+    }
+
+
+class CrawlService:
+    """Orchestrate crawlers, hash-based change detection, and LLM enrichment."""
+
+    def __init__(
+        self,
+        crawlers: dict[str, BaseCrawler],
+        event_store: BaseEventStore,
+        llm_pipeline: LlmPipeline,
+        retrieval_service: Any,
+    ) -> None:
+        self._crawlers = crawlers
+        self._store = event_store
+        self._pipeline = llm_pipeline
+        self._retrieval = retrieval_service
+
+    def run_crawl(
+        self, sources: list[str] | None = None
+    ) -> Generator[dict, None, None]:
+        """Run crawl for selected sources. Yields SSE-ready progress dicts."""
+        targets = sources or list(self._crawlers.keys())
+        total_new = 0
+        total_updated = 0
+
+        for source_key in targets:
+            crawler = self._crawlers.get(source_key)
+            if not crawler:
+                yield {"event": "error", "data": f"Unknown source: {source_key}"}
+                continue
+
+            yield {"event": "progress", "data": {"source": source_key, "stage": "fetching"}}
+            try:
+                raw_events = crawler.fetch(limit=100)
+            except Exception as exc:
+                logger.exception("Crawler failed source={}", source_key)
+                yield {"event": "error", "data": {"source": source_key, "message": str(exc)}}
+                continue
+
+            yield {
+                "event": "progress",
+                "data": {"source": source_key, "stage": "processing", "fetched": len(raw_events)},
+            }
+
+            new_count = 0
+            updated_count = 0
+
+            for raw in raw_events:
+                eid = _event_id(raw.source, raw.standard_code)
+                new_hash = _content_hash(raw.raw_text or raw.title)
+                existing = self._store.get(eid)
+
+                if existing and existing.get("content_hash") == new_hash:
+                    continue
+
+                is_update = existing is not None
+                old_text = existing.get("summary", "") if is_update else ""
+                previous_hash = existing.get("content_hash") if is_update else None
+
+                event_dict = _raw_to_dict(raw, eid, new_hash)
+                event_dict["previous_hash"] = previous_hash
+
+                try:
+                    structure = self._pipeline.extract_structure(event_dict)
+                    event_dict.update(structure)
+                except Exception as exc:
+                    logger.warning("Structure extraction failed id={} err={}", eid, exc)
+
+                try:
+                    affected = self._pipeline.assess_impact(event_dict, self._retrieval)
+                    event_dict["affected_docs"] = affected
+                except Exception as exc:
+                    logger.warning("Impact assessment failed id={} err={}", eid, exc)
+
+                if is_update and old_text and raw.raw_text:
+                    try:
+                        diff = self._pipeline.compute_diff(old_text, raw.raw_text)
+                        event_dict["change_summary"] = diff.get("change_summary")
+                        event_dict["changed_sections"] = diff.get("changed_sections")
+                    except Exception as exc:
+                        logger.warning("Diff failed id={} err={}", eid, exc)
+
+                self._store.upsert(event_dict)
+
+                if is_update:
+                    updated_count += 1
+                else:
+                    new_count += 1
+
+            total_new += new_count
+            total_updated += updated_count
+
+            yield {
+                "event": "progress",
+                "data": {
+                    "source": source_key,
+                    "stage": "done",
+                    "new": new_count,
+                    "updated": updated_count,
+                },
+            }
+
+        yield {
+            "event": "done",
+            "data": {"total_new": total_new, "total_updated": total_updated},
+        }
--- a/backend/app/application/perception/services.py
+++ b/backend/app/application/perception/services.py
@@ -6,7 +6,7 @@ import json
 from typing import Generator

 from app.application.knowledge.services import KnowledgeRetrievalService
-from app.infrastructure.perception.mock_event_store import MockEventStore
+from app.infrastructure.perception.base_event_store import BaseEventStore
 from app.services.llm.llm_factory import get_llm_client
 from app.config.settings import settings

@@ -22,7 +22,7 @@ class PerceptionService:

    def __init__(
        self,
-        event_store: MockEventStore,
+        event_store: BaseEventStore,
        retrieval_service: KnowledgeRetrievalService,
    ) -> None:
        self._store = event_store
--- a/backend/app/config/settings.py
+++ b/backend/app/config/settings.py
@@ -87,6 +87,18 @@ class Settings(BaseSettings):
    # no external worker needed. Switch to True only when a Celery worker is running.
    use_celery_worker: bool = Field(default=False, description="使用 Celery Worker 异步处理文档 (需要 Worker 运行中)")

+    # ── Perception crawl ──────────────────────────────────────────────────────
+    perception_crawl_timeout_seconds: int = Field(
+        default=120, description="HTTP timeout for regulatory source crawlers."
+    )
+    perception_max_events_per_source: int = Field(
+        default=100, description="Maximum events fetched per source per crawl run."
+    )
+    perception_diff_similarity_threshold: float = Field(
+        default=0.85,
+        description="Cosine similarity below which a paragraph is flagged as changed.",
+    )
+
    # Keep configuration setup explicit so runtime behavior is easy to reason about.
    api_host: str = Field(default="0.0.0.0", description="API服务地址")
    api_port: int = Field(default=8000, description="API服务端口")
--- a/backend/app/infrastructure/perception/base_event_store.py
+++ b/backend/app/infrastructure/perception/base_event_store.py
@@ -0,0 +1,39 @@
+"""Abstract base class for regulatory event stores."""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+
+
+class BaseEventStore(ABC):
+    """Port interface for regulatory event persistence."""
+
+    @abstractmethod
+    def all(self) -> list[dict]:
+        """Return all events, most-recent first."""
+
+    @abstractmethod
+    def get(self, event_id: str) -> dict | None:
+        """Return a single event by ID, or None."""
+
+    @abstractmethod
+    def filter(
+        self,
+        *,
+        source: str | None = None,
+        impact_level: str | None = None,
+        limit: int = 50,
+    ) -> list[dict]:
+        """Return filtered events sorted by published_at descending."""
+
+    @abstractmethod
+    def stats(self) -> dict:
+        """Return {total, high_impact, medium_impact, low_impact, recent_90d}."""
+
+    @abstractmethod
+    def upsert(self, event: dict) -> None:
+        """Insert or update an event record."""
+
+    @abstractmethod
+    def get_by_standard_code(self, standard_code: str) -> dict | None:
+        """Return the most-recent event with matching standard_code, or None."""
--- a/backend/app/infrastructure/perception/crawlers/init.py
+++ b/backend/app/infrastructure/perception/crawlers/init.py
--- a/backend/app/infrastructure/perception/crawlers/_utils.py
+++ b/backend/app/infrastructure/perception/crawlers/_utils.py
@@ -0,0 +1,43 @@
+"""Shared utility functions for crawlers."""
+
+from __future__ import annotations
+
+import re
+from datetime import date
+
+
+def parse_date(text: str) -> str:
+    """Return YYYY-MM-DD from common Chinese date formats, or today's date."""
+    text = text.strip()
+    if not text:
+        return date.today().isoformat()
+    m = re.search(r"(\d{4})[/-](\d{1,2})[/-](\d{1,2})", text)
+    if m:
+        try:
+            return date(int(m.group(1)), int(m.group(2)), int(m.group(3))).isoformat()
+        except ValueError:
+            pass
+    m2 = re.search(r"(\d{4})年(\d{1,2})月(\d{1,2})日?", text)
+    if m2:
+        try:
+            return date(int(m2.group(1)), int(m2.group(2)), int(m2.group(3))).isoformat()
+        except ValueError:
+            pass
+    return date.today().isoformat()
+
+
+def extract_tags(standard_code: str, title: str) -> list[str]:
+    """Derive simple keyword tags from standard code and title."""
+    tags: list[str] = []
+    code_upper = standard_code.upper()
+    if "GB" in code_upper:
+        tags.append("国家标准")
+    if "/T" in code_upper:
+        tags.append("推荐性")
+    else:
+        tags.append("强制性")
+    keywords = ["电动", "安全", "自动驾驶", "充电", "智能网联", "碰撞", "排放", "网络安全"]
+    for kw in keywords:
+        if kw in title:
+            tags.append(kw)
+    return tags[:5]
--- a/backend/app/infrastructure/perception/crawlers/base.py
+++ b/backend/app/infrastructure/perception/crawlers/base.py
@@ -0,0 +1,32 @@
+"""Shared contracts for regulatory source crawlers."""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+
+
+@dataclass
+class RawEvent:
+    """Raw regulatory event returned by a crawler before enrichment."""
+
+    source: str
+    source_label: str
+    standard_code: str
+    title: str
+    summary: str
+    full_text_url: str
+    status: str           # 'enacted' | 'draft' | 'consultation'
+    published_at: str     # YYYY-MM-DD string
+    effective_at: str | None
+    category: str
+    tags: list[str] = field(default_factory=list)
+    raw_text: str = ""    # full crawled text for hashing + LLM
+
+
+class BaseCrawler(ABC):
+    """Abstract regulatory source crawler."""
+
+    @abstractmethod
+    def fetch(self, limit: int = 50) -> list[RawEvent]:
+        """Fetch up to `limit` recent events from the data source."""
--- a/backend/app/infrastructure/perception/crawlers/catarc_crawler.py
+++ b/backend/app/infrastructure/perception/crawlers/catarc_crawler.py
@@ -0,0 +1,83 @@
+"""Crawler for CATARC automotive standard catalogue."""
+
+from __future__ import annotations
+
+from urllib.parse import urljoin
+
+import httpx
+from bs4 import BeautifulSoup
+from loguru import logger
+
+from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
+from ._utils import extract_tags, parse_date
+
+_BASE_URL = "https://www.catarc.org.cn/bzzxd/qcbz/index.html"
+_HOST = "https://www.catarc.org.cn"
+
+_STATUS_MAP = {
+    "现行": "enacted",
+    "即将实施": "enacted",
+    "废止": "enacted",
+    "征求意见": "consultation",
+    "报批": "draft",
+}
+
+
+class CatarcCrawler(BaseCrawler):
+    """Scrape the CATARC automotive standard list page."""
+
+    def fetch(self, limit: int = 50) -> list[RawEvent]:
+        events: list[RawEvent] = []
+        page = 1
+        max_pages = max(10, limit)
+        while len(events) < limit and page <= max_pages:
+            url = f"{_BASE_URL}?page={page}"
+            try:
+                resp = httpx.get(url, timeout=30, follow_redirects=True)
+                resp.raise_for_status()
+            except Exception as exc:
+                logger.warning("CATARC fetch failed page={} err={}", page, exc)
+                break
+
+            soup = BeautifulSoup(resp.text, "lxml")
+            rows = soup.select("table tr")
+            if not rows:
+                break
+
+            batch: list[RawEvent] = []
+            for row in rows:
+                cells = row.find_all("td")
+                if len(cells) < 3:
+                    continue
+                link = cells[0].find("a")
+                standard_code = link.get_text(strip=True) if link else cells[0].get_text(strip=True)
+                title = cells[1].get_text(strip=True) if len(cells) > 1 else standard_code
+                date_text = cells[2].get_text(strip=True) if len(cells) > 2 else ""
+                published_at = parse_date(date_text)
+                status_text = cells[3].get_text(strip=True) if len(cells) > 3 else ""
+                status = _STATUS_MAP.get(status_text, "enacted")
+                detail_url = urljoin(_HOST, link["href"]) if link and link.get("href") else url
+                raw_text = f"{standard_code} {title}"
+                batch.append(RawEvent(
+                    source="CATARC",
+                    source_label="全国汽车标准化技术委员会",
+                    standard_code=standard_code,
+                    title=title,
+                    summary=title,
+                    full_text_url=detail_url,
+                    status=status,
+                    published_at=published_at,
+                    effective_at=None,
+                    category="汽车标准",
+                    tags=extract_tags(standard_code, title),
+                    raw_text=raw_text,
+                ))
+
+            if not batch:
+                break
+            events.extend(batch)
+            page += 1
+
+        return events[:limit]
+
+
--- a/backend/app/infrastructure/perception/crawlers/eurlex_crawler.py
+++ b/backend/app/infrastructure/perception/crawlers/eurlex_crawler.py
@@ -0,0 +1,117 @@
+"""Crawler for EUR-Lex RSS feeds covering EU AI Act and automotive regulations."""
+
+from __future__ import annotations
+
+import re
+from email.utils import parsedate_to_datetime
+
+import httpx
+from bs4 import BeautifulSoup
+from loguru import logger
+
+from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
+from ._utils import parse_date
+
+_EURLEX_RSS_URLS = [
+    "https://eur-lex.europa.eu/rss-feed/OJ-L.rss",
+]
+
+_AUTOMOTIVE_KEYWORDS = [
+    "vehicle", "automotive", "motor", "tyre", "emission", "ADAS", "autonomous",
+    "AI Act", "artificial intelligence", "cybersecurity", "software update",
+    "R155", "R156", "汽车", "车辆",
+]
+
+
+_AUTOMOTIVE_KEYWORDS_LOWER = [kw.lower() for kw in _AUTOMOTIVE_KEYWORDS]
+
+
+def _is_automotive_relevant(title: str, description: str) -> bool:
+    combined = (title + " " + description).lower()
+    return any(kw in combined for kw in _AUTOMOTIVE_KEYWORDS_LOWER)
+
+
+def _extract_celex(url: str) -> str:
+    m = re.search(r"CELEX[:/]([0-9A-Z]+)", url)
+    return m.group(1) if m else ""
+
+
+def _parse_rss_date(rfc2822: str) -> str:
+    try:
+        dt = parsedate_to_datetime(rfc2822)
+        return dt.date().isoformat()
+    except Exception:
+        return parse_date(rfc2822)
+
+
+class EurlexCrawler(BaseCrawler):
+    """Fetch automotive-relevant EU regulations from EUR-Lex RSS feeds."""
+
+    def fetch(self, limit: int = 50) -> list[RawEvent]:
+        events: list[RawEvent] = []
+        for rss_url in _EURLEX_RSS_URLS:
+            if len(events) >= limit:
+                break
+            try:
+                resp = httpx.get(rss_url, timeout=30, follow_redirects=True)
+                resp.raise_for_status()
+            except Exception as exc:
+                logger.warning("EUR-Lex RSS fetch failed url={} err={}", rss_url, exc)
+                continue
+
+            soup = BeautifulSoup(resp.content, "lxml-xml")
+            for item in soup.find_all("item"):
+                if len(events) >= limit:
+                    break
+                title_tag = item.find("title")
+                title = title_tag.get_text(strip=True) if title_tag else ""
+                desc_tag = item.find("description")
+                description = desc_tag.get_text(strip=True) if desc_tag else ""
+                link_tag = item.find("link")
+                link = link_tag.get_text(strip=True) if link_tag else ""
+                pub_date_tag = item.find("pubDate")
+                pub_date = pub_date_tag.get_text(strip=True) if pub_date_tag else ""
+
+                if not _is_automotive_relevant(title, description):
+                    continue
+
+                celex = _extract_celex(link)
+                standard_code = celex if celex else title[:60]
+                published_at = _parse_rss_date(pub_date) if pub_date else ""
+
+                events.append(RawEvent(
+                    source="EUR-Lex",
+                    source_label="欧盟官方公报",
+                    standard_code=standard_code,
+                    title=title,
+                    summary=description[:500],
+                    full_text_url=link,
+                    status="enacted",
+                    published_at=published_at,
+                    effective_at=None,
+                    category="EU法规",
+                    tags=_extract_eurlex_tags(title, description),
+                    raw_text=f"{title}\n{description}",
+                ))
+
+        return events[:limit]
+
+
+def _extract_eurlex_tags(title: str, description: str) -> list[str]:
+    combined = title + " " + description
+    tag_map = {
+        "AI Act": "EU AI Act",
+        "artificial intelligence": "EU AI Act",
+        "R155": "UN R155",
+        "R156": "UN R156",
+        "cybersecurity": "网络安全",
+        "emission": "排放",
+        "autonomous": "自动驾驶",
+        "ADAS": "ADAS",
+    }
+    combined_lower = combined.lower()
+    tags = []
+    for kw, tag in tag_map.items():
+        if kw.lower() in combined_lower:
+            tags.append(tag)
+    return tags[:5]
--- a/backend/app/infrastructure/perception/crawlers/guobiao_crawler.py
+++ b/backend/app/infrastructure/perception/crawlers/guobiao_crawler.py
@@ -0,0 +1,92 @@
+"""Crawlers for the 国标委 (SAMR) standard information platform."""
+
+from __future__ import annotations
+
+import httpx
+from loguru import logger
+
+from app.infrastructure.perception.crawlers.base import BaseCrawler, RawEvent
+from ._utils import extract_tags, parse_date
+
+_BASE_URL = "https://openstd.samr.gov.cn/bzgk/std/std_list_type"
+_HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; RegulatoryBot/1.0)"}
+
+
+def _fetch_page(std_type: int, page: int, page_size: int) -> list[dict]:
+    params = {
+        "p.p1": std_type,
+        "p.p2": "车",
+        "p.p90": "circulation_date",
+        "p.p91": "desc",
+        "p.p6": page,
+        "p.p7": page_size,
+    }
+    try:
+        resp = httpx.get(_BASE_URL, params=params, headers=_HEADERS, timeout=30)
+        resp.raise_for_status()
+        data = resp.json()
+        return data.get("rows", []) or []
+    except Exception as exc:
+        logger.warning("国标委 fetch failed type={} page={} err={}", std_type, page, exc)
+        return []
+
+
+def _row_to_raw_event(row: dict, source_label: str) -> RawEvent:
+    standard_code = row.get("std_code", "")
+    title = row.get("std_name", standard_code)
+    published_at = parse_date(row.get("release_date", ""))
+    effective_at_raw = row.get("implement_date", "")
+    effective_at = parse_date(effective_at_raw) if effective_at_raw else None
+    status_text = row.get("std_status", "")
+    if "征求意见" in status_text:
+        status = "consultation"
+    elif "报批" in status_text or "草案" in status_text:
+        status = "draft"
+    else:
+        status = "enacted"
+    return RawEvent(
+        source="国标委",
+        source_label=source_label,
+        standard_code=standard_code,
+        title=title,
+        summary=title,
+        full_text_url=f"https://openstd.samr.gov.cn/bzgk/std/detail?id={row.get('id', '')}",
+        status=status,
+        published_at=published_at,
+        effective_at=effective_at,
+        category=row.get("std_type", "国家标准"),
+        tags=extract_tags(standard_code, title),
+        raw_text=f"{standard_code} {title}",
+    )
+
+
+class GuobiaoMandatoryCrawler(BaseCrawler):
+    """Fetch mandatory national standards (强制性) related to vehicles."""
+
+    def fetch(self, limit: int = 50) -> list[RawEvent]:
+        events: list[RawEvent] = []
+        page = 1
+        max_pages = max(10, limit)
+        while len(events) < limit and page <= max_pages:
+            rows = _fetch_page(std_type=1, page=page, page_size=20)
+            if not rows:
+                break
+            events.extend(_row_to_raw_event(r, "国标委·强制性") for r in rows)
+            page += 1
+        return events[:limit]
+
+
+class GuobiaoRecommendedCrawler(BaseCrawler):
+    """Fetch recommended national standards (推荐性) related to vehicles."""
+
+    def fetch(self, limit: int = 50) -> list[RawEvent]:
+        events: list[RawEvent] = []
+        page = 1
+        max_pages = max(10, limit)
+        while len(events) < limit and page <= max_pages:
+            rows = _fetch_page(std_type=2, page=page, page_size=20)
+            if not rows:
+                break
+            events.extend(_row_to_raw_event(r, "国标委·推荐性") for r in rows)
+            page += 1
+        return events[:limit]
--- a/backend/app/infrastructure/perception/llm_pipeline.py
+++ b/backend/app/infrastructure/perception/llm_pipeline.py
@@ -0,0 +1,241 @@
+"""LLM-driven pipeline for regulatory event enrichment."""
+
+from __future__ import annotations
+
+import json
+import math
+from typing import Any
+
+from loguru import logger
+
+from app.config.settings import settings
+from app.infrastructure.embedding.openai_compatible_embedding_provider import (
+    OpenAICompatibleEmbeddingProvider,
+)
+from app.services.llm.llm_factory import get_llm_client
+
+_EXTRACT_SYSTEM = (
+    "You are a regulatory compliance expert specialising in automotive standards "
+    "(GB, UN-ECE, ISO, EU). Extract structured information from regulation text. "
+    "Return valid JSON only — no markdown fences, no extra keys."
+)
+
+_ASSESS_SYSTEM = (
+    "You are an automotive compliance analyst. Given a regulation and related document excerpts, "
+    "identify which documents are affected and what actions are required. "
+    "Return a JSON array only."
+)
+
+_DIFF_SYSTEM = (
+    "You are a regulatory change analyst. Given an old and new version of a regulation paragraph, "
+    "classify the type of change and summarise it. "
+    "Return JSON only: {\"change_type\": \"tightened|relaxed|added|removed\", \"summary\": \"...\"}"
+)
+
+_SIMILARITY_THRESHOLD = 0.85
+
+
+def _cosine(a: list[float], b: list[float]) -> float:
+    dot = sum(x * y for x, y in zip(a, b))
+    norm_a = math.sqrt(sum(x * x for x in a))
+    norm_b = math.sqrt(sum(x * x for x in b))
+    if norm_a == 0 or norm_b == 0:
+        return 0.0
+    return dot / (norm_a * norm_b)
+
+
+def _llm_json(client: Any, messages: list[dict]) -> Any:
+    """Call LLM and parse JSON response; return None on failure."""
+    try:
+        resp = client.chat(messages)
+        text = (resp.content or "").strip()
+        if text.startswith("```"):
+            text = text.split("```")[1]
+            if text.startswith("json"):
+                text = text[4:]
+        return json.loads(text)
+    except Exception as exc:
+        logger.warning("LLM JSON parse failed: {}", exc)
+        return None
+
+
+class LlmPipeline:
+    """Three-step enrichment pipeline for crawled regulatory events."""
+
+    def __init__(self) -> None:
+        self._client = get_llm_client(
+            provider=settings.llm_provider,
+            model=settings.llm_model,
+        )
+        self._embedder = OpenAICompatibleEmbeddingProvider()
+
+    # ------------------------------------------------------------------
+    # Step 1: Structure extraction
+    # ------------------------------------------------------------------
+
+    def extract_structure(self, event: dict) -> dict:
+        """Extract obligations, deadlines, scope, penalties, impact_level from event text."""
+        prompt = f"""Extract structured compliance information from this regulation:
+
+Standard: {event.get('standard_code', '')}
+Title: {event.get('title', '')}
+Source: {event.get('source_label', '')}
+Summary: {event.get('summary', '')}
+Tags: {', '.join(event.get('tags') or [])}
+
+Return JSON with exactly these keys:
+{{
+  "obligations": [{{"text": "...", "deontic": "must|shall|may|prohibited", "subject": "...", "object": "...", "condition": ""}}],
+  "deadlines": [{{"date": "YYYY-MM-DD or null", "description": "..."}}],
+  "scope": "one sentence describing who/what this applies to",
+  "penalties": "one sentence on consequences of non-compliance, or null",
+  "impact_level": "high|medium|low"
+}}"""
+
+        messages = [
+            {"role": "system", "content": _EXTRACT_SYSTEM},
+            {"role": "user", "content": prompt},
+        ]
+        result = _llm_json(self._client, messages)
+        if not isinstance(result, dict):
+            return {
+                "obligations": [],
+                "deadlines": [],
+                "scope": "",
+                "penalties": "",
+                "impact_level": "medium",
+            }
+        return result
+
+    # ------------------------------------------------------------------
+    # Step 2: Impact assessment
+    # ------------------------------------------------------------------
+
+    def assess_impact(self, event: dict, retrieval_service: Any) -> list[dict]:
+        """Use RAG to find affected documents and generate recommendations."""
+        obligations = event.get("obligations") or []
+        obligation_texts = " ".join(o.get("text", "") for o in obligations[:3])
+        query = f"{event.get('standard_code', '')} {event.get('title', '')} {obligation_texts}"
+
+        try:
+            chunks = retrieval_service.retrieve(query=query, top_k=5)
+        except Exception as exc:
+            logger.warning("RAG retrieval failed: {}", exc)
+            return []
+
+        if not chunks:
+            return []
+
+        seen: set[str] = set()
+        doc_excerpts: list[dict] = []
+        for chunk in chunks:
+            if chunk.doc_id not in seen:
+                seen.add(chunk.doc_id)
+                doc_excerpts.append({
+                    "doc_id": chunk.doc_id,
+                    "doc_name": chunk.doc_title,
+                    "score": round(float(chunk.score if chunk.score is not None else 0), 4),
+                    "snippet": (chunk.text or "")[:300],
+                    "clause": getattr(chunk, "section_title", "") or "",
+                })
+
+        context = "\n".join(
+            f"[{d['doc_name']} {d['clause']}] score={d['score']}: {d['snippet']}"
+            for d in doc_excerpts
+        )
+        prompt = f"""Regulation: {event.get('standard_code')} — {event.get('title')}
+Obligations: {obligation_texts or event.get('summary', '')}
+
+Affected documents found in knowledge base:
+{context}
+
+For each document, assess impact and recommend action. Return JSON array:
+[{{"doc_id":"...","doc_name":"...","score":0.0,"key_clauses":"...","recommendation":"one sentence action"}}]"""
+
+        messages = [
+            {"role": "system", "content": _ASSESS_SYSTEM},
+            {"role": "user", "content": prompt},
+        ]
+        result = _llm_json(self._client, messages)
+        if isinstance(result, list):
+            score_map = {d["doc_id"]: d["score"] for d in doc_excerpts}
+            for item in result:
+                if isinstance(item, dict) and item.get("doc_id") in score_map:
+                    item["score"] = score_map[item["doc_id"]]
+            return result
+        return doc_excerpts
+
+    # ------------------------------------------------------------------
+    # Step 3: Semantic diff
+    # ------------------------------------------------------------------
+
+    def compute_diff(self, old_text: str, new_text: str) -> dict:
+        """Compare old and new regulation text; return changed sections and summary."""
+        old_paras = [p.strip() for p in old_text.split("\n") if p.strip()]
+        new_paras = [p.strip() for p in new_text.split("\n") if p.strip()]
+
+        if not old_paras or not new_paras:
+            return {"changed_sections": [], "change_summary": "No comparable text."}
+
+        all_paras = old_paras + new_paras
+        try:
+            all_embeddings = self._embedder.embed_texts(all_paras)
+        except Exception as exc:
+            logger.warning("Embedding for diff failed: {}", exc)
+            return {"changed_sections": [], "change_summary": "Diff unavailable (embedding error)."}
+
+        old_embeddings = all_embeddings[: len(old_paras)]
+        new_embeddings = all_embeddings[len(old_paras):]
+
+        changed_sections: list[dict] = []
+        max_len = max(len(old_paras), len(new_paras))
+
+        for i in range(max_len):
+            if i >= len(old_paras):
+                # New paragraph added
+                changed_sections.append({
+                    "old_text": "",
+                    "new_text": new_paras[i][:300],
+                    "similarity": 0.0,
+                    "change_type": "added",
+                    "summary": "New paragraph added.",
+                })
+                continue
+            if i >= len(new_paras):
+                # Old paragraph removed
+                changed_sections.append({
+                    "old_text": old_paras[i][:300],
+                    "new_text": "",
+                    "similarity": 0.0,
+                    "change_type": "removed",
+                    "summary": "Paragraph removed.",
+                })
+                continue
+            # Both exist — compare via embeddings
+            sim = _cosine(old_embeddings[i], new_embeddings[i])
+            if sim < _SIMILARITY_THRESHOLD:
+                messages = [
+                    {"role": "system", "content": _DIFF_SYSTEM},
+                    {"role": "user", "content": f"OLD: {old_paras[i][:500]}\nNEW: {new_paras[i][:500]}"},
+                ]
+                classification = _llm_json(self._client, messages) or {}
+                changed_sections.append({
+                    "old_text": old_paras[i][:300],
+                    "new_text": new_paras[i][:300],
+                    "similarity": round(sim, 3),
+                    "change_type": classification.get("change_type", "modified"),
+                    "summary": classification.get("summary", ""),
+                })
+
+        if not changed_sections:
+            change_summary = "No substantive changes detected between versions."
+        else:
+            types = [s["change_type"] for s in changed_sections]
+            change_summary = (
+                f"{len(changed_sections)} paragraph(s) changed: "
+                + ", ".join(f"{t}" for t in set(types))
+                + ". "
+                + (changed_sections[0].get("summary", "") if changed_sections else "")
+            )
+
+        return {"changed_sections": changed_sections, "change_summary": change_summary}
--- a/backend/app/infrastructure/perception/mock_event_store.py
+++ b/backend/app/infrastructure/perception/mock_event_store.py
@@ -4,6 +4,8 @@ from __future__ import annotations

 from typing import Any

+from app.infrastructure.perception.base_event_store import BaseEventStore
+
 MOCK_EVENTS: list[dict[str, Any]] = [
    # ------------------------------------------------------------------ HIGH
    {
@@ -379,18 +381,18 @@ MOCK_EVENTS: list[dict[str, Any]] = [
    },
 ]

-# Index for fast lookup
-_EVENT_INDEX: dict[str, dict] = {e["id"]: e for e in MOCK_EVENTS}
-
-
-class MockEventStore:
+class MockEventStore(BaseEventStore):
    """In-memory mock store for regulatory events."""

+    def __init__(self) -> None:
+        self._events: list[dict] = [dict(e) for e in MOCK_EVENTS]
+        self._index: dict[str, dict] = {e["id"]: e for e in self._events}
+
    def all(self) -> list[dict]:
-        return list(MOCK_EVENTS)
+        return list(self._events)

    def get(self, event_id: str) -> dict | None:
-        return _EVENT_INDEX.get(event_id)
+        return self._index.get(event_id)

    def filter(
        self,
@@ -399,23 +401,39 @@ class MockEventStore:
        impact_level: str | None = None,
        limit: int = 50,
    ) -> list[dict]:
-        events = list(MOCK_EVENTS)
+        events = list(self._events)
        if source:
            events = [e for e in events if e["source"] == source]
        if impact_level:
            events = [e for e in events if e["impact_level"] == impact_level]
-        events.sort(key=lambda e: e["published_at"], reverse=True)
+        events.sort(key=lambda e: e.get("published_at") or "", reverse=True)
        return events[:limit]

    def stats(self) -> dict:
        from datetime import date, timedelta

-        events = MOCK_EVENTS
+        events = self._events
        cutoff = (date.today() - timedelta(days=90)).isoformat()
        return {
            "total": len(events),
            "high_impact": sum(1 for e in events if e["impact_level"] == "high"),
            "medium_impact": sum(1 for e in events if e["impact_level"] == "medium"),
            "low_impact": sum(1 for e in events if e["impact_level"] == "low"),
-            "recent_90d": sum(1 for e in events if e["published_at"] >= cutoff),
+            "recent_90d": sum(1 for e in events if (e.get("published_at") or "") >= cutoff),
        }
+
+    def upsert(self, event: dict) -> None:
+        """Insert or update event in the in-memory list (used in tests)."""
+        existing = self._index.get(event["id"])
+        if existing:
+            existing.update(event)
+        else:
+            self._events.append(event)
+            self._index[event["id"]] = event
+
+    def get_by_standard_code(self, standard_code: str) -> dict | None:
+        """Return most-recent event with matching standard_code."""
+        matches = [e for e in self._events if e.get("standard_code") == standard_code]
+        if not matches:
+            return None
+        return max(matches, key=lambda e: e.get("published_at", ""))
--- a/backend/app/infrastructure/perception/postgres_event_store.py
+++ b/backend/app/infrastructure/perception/postgres_event_store.py
@@ -0,0 +1,225 @@
+"""PostgreSQL-backed regulatory event store."""
+
+from __future__ import annotations
+
+import json
+from contextlib import contextmanager
+from datetime import UTC, date, datetime, timedelta
+from typing import Any
+
+import psycopg2
+import psycopg2.extras
+from psycopg2.pool import ThreadedConnectionPool
+
+from app.config.settings import settings
+from app.infrastructure.perception.base_event_store import BaseEventStore
+
+_CREATE_TABLE = """
+CREATE TABLE IF NOT EXISTS regulation_events (
+    id               TEXT PRIMARY KEY,
+    source           TEXT NOT NULL,
+    source_label     TEXT,
+    standard_code    TEXT NOT NULL,
+    title            TEXT NOT NULL,
+    summary          TEXT,
+    full_text_url    TEXT,
+    status           TEXT,
+    impact_level     TEXT,
+    published_at     DATE,
+    effective_at     DATE,
+    category         TEXT,
+    tags             TEXT[],
+    obligations      JSONB,
+    deadlines        JSONB,
+    scope            TEXT,
+    penalties        TEXT,
+    content_hash     TEXT,
+    previous_hash    TEXT,
+    change_summary   TEXT,
+    changed_sections JSONB,
+    affected_docs    JSONB,
+    crawled_at       TIMESTAMPTZ DEFAULT now(),
+    processed_at     TIMESTAMPTZ,
+    raw_storage_key  TEXT
+);
+CREATE INDEX IF NOT EXISTS reg_events_source_date
+    ON regulation_events (source, published_at DESC);
+CREATE INDEX IF NOT EXISTS reg_events_impact_date
+    ON regulation_events (impact_level, published_at DESC);
+"""
+
+_ALL_COLUMNS = (
+    "id", "source", "source_label", "standard_code", "title", "summary",
+    "full_text_url", "status", "impact_level", "published_at", "effective_at",
+    "category", "tags", "obligations", "deadlines", "scope", "penalties",
+    "content_hash", "previous_hash", "change_summary", "changed_sections",
+    "affected_docs", "crawled_at", "processed_at", "raw_storage_key",
+)
+
+
+def _row_to_dict(row: dict[str, Any]) -> dict:
+    """Convert a psycopg2 RealDictRow to a plain dict with serialized JSON fields."""
+    d = dict(row)
+    for field in ("obligations", "deadlines", "changed_sections", "affected_docs"):
+        val = d.get(field)
+        if isinstance(val, str):
+            d[field] = json.loads(val)
+    for date_field in ("published_at", "effective_at"):
+        val = d.get(date_field)
+        if isinstance(val, datetime):
+            d[date_field] = val.date().isoformat()
+        elif isinstance(val, date):
+            d[date_field] = val.isoformat()
+    for ts_field in ("crawled_at", "processed_at"):
+        val = d.get(ts_field)
+        if isinstance(val, datetime):
+            d[ts_field] = val.isoformat()
+    return d
+
+
+class PostgresEventStore(BaseEventStore):
+    """Regulatory event store backed by PostgreSQL."""
+
+    def __init__(self) -> None:
+        self._pool = ThreadedConnectionPool(
+            minconn=1,
+            maxconn=5,
+            host=settings.postgres_host,
+            port=settings.postgres_port,
+            user=settings.postgres_user,
+            password=settings.postgres_password,
+            dbname=settings.postgres_db,
+        )
+        self._ensure_schema()
+
+    def _ensure_schema(self) -> None:
+        with self._conn() as conn:
+            try:
+                with conn.cursor() as cur:
+                    cur.execute(_CREATE_TABLE)
+                conn.commit()
+            except Exception:
+                conn.rollback()
+                raise
+
+    @contextmanager
+    def _conn(self):
+        conn = None
+        try:
+            conn = self._pool.getconn()
+            yield conn
+        finally:
+            if conn is not None:
+                self._pool.putconn(conn)
+
+    def all(self) -> list[dict]:
+        with self._conn() as conn:
+            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+                cur.execute(
+                    "SELECT * FROM regulation_events ORDER BY published_at DESC NULLS LAST"
+                )
+                return [_row_to_dict(r) for r in cur.fetchall()]
+
+    def get(self, event_id: str) -> dict | None:
+        with self._conn() as conn:
+            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+                cur.execute(
+                    "SELECT * FROM regulation_events WHERE id = %s", (event_id,)
+                )
+                row = cur.fetchone()
+                return _row_to_dict(row) if row else None
+
+    def filter(
+        self,
+        *,
+        source: str | None = None,
+        impact_level: str | None = None,
+        limit: int = 50,
+    ) -> list[dict]:
+        conditions: list[str] = []
+        params: list[Any] = []
+        if source:
+            conditions.append("source = %s")
+            params.append(source)
+        if impact_level:
+            conditions.append("impact_level = %s")
+            params.append(impact_level)
+        where = ("WHERE " + " AND ".join(conditions)) if conditions else ""
+        params.append(limit)
+        sql = f"""
+            SELECT * FROM regulation_events
+            {where}
+            ORDER BY published_at DESC NULLS LAST
+            LIMIT %s
+        """
+        with self._conn() as conn:
+            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+                cur.execute(sql, params)
+                return [_row_to_dict(r) for r in cur.fetchall()]
+
+    def stats(self) -> dict:
+        cutoff = (date.today() - timedelta(days=90)).isoformat()
+        with self._conn() as conn:
+            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+                cur.execute("SELECT COUNT(*) AS count FROM regulation_events")
+                total = (cur.fetchone() or {}).get("count", 0)
+                cur.execute(
+                    "SELECT COUNT(*) AS count FROM regulation_events WHERE impact_level = 'high'"
+                )
+                high = (cur.fetchone() or {}).get("count", 0)
+                cur.execute(
+                    "SELECT COUNT(*) AS count FROM regulation_events WHERE impact_level = 'medium'"
+                )
+                medium = (cur.fetchone() or {}).get("count", 0)
+                cur.execute(
+                    "SELECT COUNT(*) AS count FROM regulation_events WHERE published_at >= %s",
+                    (cutoff,),
+                )
+                recent = (cur.fetchone() or {}).get("count", 0)
+        return {
+            "total": int(total),
+            "high_impact": int(high),
+            "medium_impact": int(medium),
+            "recent_90d": int(recent),
+        }
+
+    def upsert(self, event: dict) -> None:
+        """Insert or update a regulation event."""
+        cols = [c for c in _ALL_COLUMNS if c in event]
+        placeholders = ", ".join(f"%({c})s" for c in cols)
+        updates = ", ".join(f"{c} = EXCLUDED.{c}" for c in cols if c != "id")
+        sql = f"""
+            INSERT INTO regulation_events ({', '.join(cols)})
+            VALUES ({placeholders})
+            ON CONFLICT (id) DO UPDATE SET {updates}
+        """
+        row: dict[str, Any] = {}
+        for c in cols:
+            val = event.get(c)
+            if c in ("obligations", "deadlines", "changed_sections", "affected_docs") and val is not None:
+                row[c] = json.dumps(val, ensure_ascii=False)
+            elif c == "tags" and isinstance(val, list):
+                row[c] = val
+            else:
+                row[c] = val
+        with self._conn() as conn:
+            try:
+                with conn.cursor() as cur:
+                    cur.execute(sql, row)
+                conn.commit()
+            except Exception:
+                conn.rollback()
+                raise
+
+    def get_by_standard_code(self, standard_code: str) -> dict | None:
+        with self._conn() as conn:
+            with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+                cur.execute(
+                    """SELECT * FROM regulation_events
+                       WHERE standard_code = %s
+                       ORDER BY published_at DESC NULLS LAST
+                       LIMIT 1""",
+                    (standard_code,),
+                )
+                row = cur.fetchone()
+                return _row_to_dict(row) if row else None
--- a/backend/app/shared/bootstrap.py
+++ b/backend/app/shared/bootstrap.py
@@ -19,6 +19,15 @@ from app.infrastructure.parser.local_chunk_builder import LocalRegulationChunkBu
 from app.infrastructure.parser.local_document_parser import LocalDocumentParser
 from app.infrastructure.parser.vector_chunk_builder import AliyunVectorChunkBuilder
 from app.infrastructure.perception.mock_event_store import MockEventStore
+from app.application.perception.crawl_service import CrawlService
+from app.infrastructure.perception.base_event_store import BaseEventStore
+from app.infrastructure.perception.crawlers.catarc_crawler import CatarcCrawler
+from app.infrastructure.perception.crawlers.guobiao_crawler import (
+    GuobiaoMandatoryCrawler,
+    GuobiaoRecommendedCrawler,
+)
+from app.infrastructure.perception.crawlers.eurlex_crawler import EurlexCrawler
+from app.infrastructure.perception.llm_pipeline import LlmPipeline
 from app.infrastructure.session.in_memory_conversation_store import InMemoryConversationStore
 from app.infrastructure.storage.json_document_processing_store import JsonDocumentProcessingStore
 from app.infrastructure.storage.json_document_repository import JsonDocumentRepository
@@ -293,11 +302,35 @@ def get_agent_conversation_service() -> AgentConversationService:
    )


+@lru_cache
+def get_event_store() -> BaseEventStore:
+    """Return event store selected by DOCUMENT_REPOSITORY_BACKEND setting."""
+    if settings.document_repository_backend == "postgres":
+        from app.infrastructure.perception.postgres_event_store import PostgresEventStore
+        return PostgresEventStore()
+    return MockEventStore()
+
+
@lru_cache
 def get_perception_service() -> PerceptionService:
-    """Return perception service for regulatory intelligence."""
    return PerceptionService(
-        event_store=MockEventStore(),
+        event_store=get_event_store(),
+        retrieval_service=get_retrieval_service(),
+    )
+
+
+@lru_cache
+def get_crawl_service() -> CrawlService:
+    crawlers = {
+        "CATARC": CatarcCrawler(),
+        "国标委·强制性": GuobiaoMandatoryCrawler(),
+        "国标委·推荐性": GuobiaoRecommendedCrawler(),
+        "EUR-Lex": EurlexCrawler(),
+    }
+    return CrawlService(
+        crawlers=crawlers,
+        event_store=get_event_store(),
+        llm_pipeline=LlmPipeline(),
        retrieval_service=get_retrieval_service(),
    )